library(readxl) # for reading in excel files
library(janitor) # data checks and cleaning
library(glue) # for easy pasting
library(FactoMineR) # for PCA
library(factoextra) # for PCA
library(rstatix) # for stats
library(pheatmap) # for heatmaps
library(plotly) # for interactive plots
library(htmlwidgets) # for saving interactive plots
library(devtools)
library(notame) # used for feature clustering
library(doParallel)
library(igraph) # feature clustering
library(ggpubr) # visualizations
library(knitr) # clean table printing
library(mixOmics) # for multilevel PCAs
library(ggthemes)
library(pathview) # for functional analysis and KEGG annotation
library(ggcorrplot)
library(corrr)
library(ggthemes)
library(PCAtools)
library(tidyverse) # for everything# rename "row ID"
omicsdata <- omicsdata %>%
rename("row_ID" = `row ID`)
# how many features
nrow(omicsdata)## [1] 946
## # A tibble: 2 × 60
## mz_rt dupe_count row_ID `6101_U4_C18POS_14` `6110_U2_C18POS_34`
## <chr> <int> <dbl> <dbl> <dbl>
## 1 369.152_4.502 2 4830 40321. 3464.
## 2 369.152_4.502 2 4832 40321. 3464.
## # ℹ 55 more variables: `6104_U2_C18POS_19` <dbl>, `6109_U4_C18POS_22` <dbl>,
## # `6111_U1_C18POS_51` <dbl>, `6110_U3_C18POS_45` <dbl>,
## # `6105_U1_C18POS_43` <dbl>, `6111_U4_C18POS_42` <dbl>,
## # `6113_U2_C18POS_31` <dbl>, `6105_U2_C18POS_40` <dbl>,
## # `6109_U1_C18POS_58` <dbl>, `6113_U4_C18POS_62` <dbl>,
## # `6102_U1_C18POS_26_1` <dbl>, `6102_U2_C18POS_16` <dbl>,
## # `6105_U4_C18POS_47` <dbl>, `6105_U3_C18POS_39` <dbl>, …
Remove duplicates
# remove dupes
omicsdata <- omicsdata %>%
distinct(mz_rt, .keep_all = TRUE)
# check again for dupes
omicsdata %>% get_dupes(mz_rt)## # A tibble: 0 × 60
## # ℹ 60 variables: mz_rt <chr>, dupe_count <int>, row_ID <dbl>,
## # 6101_U4_C18POS_14 <dbl>, 6110_U2_C18POS_34 <dbl>, 6104_U2_C18POS_19 <dbl>,
## # 6109_U4_C18POS_22 <dbl>, 6111_U1_C18POS_51 <dbl>, 6110_U3_C18POS_45 <dbl>,
## # 6105_U1_C18POS_43 <dbl>, 6111_U4_C18POS_42 <dbl>, 6113_U2_C18POS_31 <dbl>,
## # 6105_U2_C18POS_40 <dbl>, 6109_U1_C18POS_58 <dbl>, 6113_U4_C18POS_62 <dbl>,
## # 6102_U1_C18POS_26_1 <dbl>, 6102_U2_C18POS_16 <dbl>,
## # 6105_U4_C18POS_47 <dbl>, 6105_U3_C18POS_39 <dbl>, …
## [1] 945
Remove weird empty column
## [1] "mz_rt"
## [2] "row_ID"
## [3] "6101_U4_C18POS_14"
## [4] "6110_U2_C18POS_34"
## [5] "6104_U2_C18POS_19"
## [6] "6109_U4_C18POS_22"
## [7] "6111_U1_C18POS_51"
## [8] "6110_U3_C18POS_45"
## [9] "6105_U1_C18POS_43"
## [10] "6111_U4_C18POS_42"
## [11] "6113_U2_C18POS_31"
## [12] "6105_U2_C18POS_40"
## [13] "6109_U1_C18POS_58"
## [14] "6113_U4_C18POS_62"
## [15] "6102_U1_C18POS_26_1"
## [16] "6102_U2_C18POS_16"
## [17] "6105_U4_C18POS_47"
## [18] "6105_U3_C18POS_39"
## [19] "6111_U2_C18POS_35"
## [20] "6103_U4_C18POS_53"
## [21] "6102_U3_C18POS_48"
## [22] "6102_U4_C18POS_50"
## [23] "6104_U4_C18POS_12"
## [24] "6101_U1_C18POS_59"
## [25] "6103_U3_C18POS_61"
## [26] "6108_U2_C18POS_27_1"
## [27] "6103_U2_C18POS_60"
## [28] "6101_U2_C18POS_30"
## [29] "6112_U2_C18POS_37"
## [30] "6113_U1_C18POS_24"
## [31] "6108_U4_C18POS_18"
## [32] "6104_U1_C18POS_55"
## [33] "6113_U3_C18POS_15"
## [34] "6101_U3_C18POS_29_1"
## [35] "6106_U3_C18POS_20"
## [36] "6106_U2_C18POS_46"
## [37] "6108_U1_C18POS_44"
## [38] "6111_U3_C18POS_54"
## [39] "6112_U4_C18POS_63"
## [40] "6109_U3_C18POS_52"
## [41] "6110_U1_C18POS_11"
## [42] "6110_U4_C18POS_start-10"
## [43] "6112_U3_C18POS_56"
## [44] "6108_U3_C18POS_28_1"
## [45] "6103_U1_C18POS_21"
## [46] "6106_U1_C18POS_23"
## [47] "6109_U2_C18POS_13"
## [48] "6106_U4_C18POS_36"
## [49] "6112_U1_C18POS_38"
## [50] "PooledQC6_C18POS_17"
## [51] "PooledQC11_C18POS_57"
## [52] "PooledQC9_C18POS_41"
## [53] "6104_U3_C18POS_32"
## [54] "PooledQC10_C18POS_49"
## [55] "PooledQC12_C18POS_64"
## [56] "PrerunPooledQC5tryagain-addednew3_C18POS_14_1"
## [57] "PooledQC8_C18POS_33"
## [58] "PooledQC7_C18POS_25"
## [59] "...59"
# remove weird lgl column
omicsdata <- omicsdata %>%
dplyr::select(!where(is.logical))
colnames(omicsdata)## [1] "mz_rt"
## [2] "row_ID"
## [3] "6101_U4_C18POS_14"
## [4] "6110_U2_C18POS_34"
## [5] "6104_U2_C18POS_19"
## [6] "6109_U4_C18POS_22"
## [7] "6111_U1_C18POS_51"
## [8] "6110_U3_C18POS_45"
## [9] "6105_U1_C18POS_43"
## [10] "6111_U4_C18POS_42"
## [11] "6113_U2_C18POS_31"
## [12] "6105_U2_C18POS_40"
## [13] "6109_U1_C18POS_58"
## [14] "6113_U4_C18POS_62"
## [15] "6102_U1_C18POS_26_1"
## [16] "6102_U2_C18POS_16"
## [17] "6105_U4_C18POS_47"
## [18] "6105_U3_C18POS_39"
## [19] "6111_U2_C18POS_35"
## [20] "6103_U4_C18POS_53"
## [21] "6102_U3_C18POS_48"
## [22] "6102_U4_C18POS_50"
## [23] "6104_U4_C18POS_12"
## [24] "6101_U1_C18POS_59"
## [25] "6103_U3_C18POS_61"
## [26] "6108_U2_C18POS_27_1"
## [27] "6103_U2_C18POS_60"
## [28] "6101_U2_C18POS_30"
## [29] "6112_U2_C18POS_37"
## [30] "6113_U1_C18POS_24"
## [31] "6108_U4_C18POS_18"
## [32] "6104_U1_C18POS_55"
## [33] "6113_U3_C18POS_15"
## [34] "6101_U3_C18POS_29_1"
## [35] "6106_U3_C18POS_20"
## [36] "6106_U2_C18POS_46"
## [37] "6108_U1_C18POS_44"
## [38] "6111_U3_C18POS_54"
## [39] "6112_U4_C18POS_63"
## [40] "6109_U3_C18POS_52"
## [41] "6110_U1_C18POS_11"
## [42] "6110_U4_C18POS_start-10"
## [43] "6112_U3_C18POS_56"
## [44] "6108_U3_C18POS_28_1"
## [45] "6103_U1_C18POS_21"
## [46] "6106_U1_C18POS_23"
## [47] "6109_U2_C18POS_13"
## [48] "6106_U4_C18POS_36"
## [49] "6112_U1_C18POS_38"
## [50] "PooledQC6_C18POS_17"
## [51] "PooledQC11_C18POS_57"
## [52] "PooledQC9_C18POS_41"
## [53] "6104_U3_C18POS_32"
## [54] "PooledQC10_C18POS_49"
## [55] "PooledQC12_C18POS_64"
## [56] "PrerunPooledQC5tryagain-addednew3_C18POS_14_1"
## [57] "PooledQC8_C18POS_33"
## [58] "PooledQC7_C18POS_25"
# create long df for omics df
omicsdata_tidy <- omicsdata %>%
pivot_longer(cols = 3:ncol(.),
names_to = "sample_ID",
values_to = "peak_height")
# combine meta and omics dfs
df_combined <- full_join(omicsdata_tidy,
metadata,
by = c("sample_ID" = "sample_ID"))
# separate mz and rt
df_combined_sep <- df_combined %>%
separate(col = mz_rt,
into = c("mz", "rt"),
sep = "_")
# convert columns to correct type
df_combined_sep$mz <- as.numeric(df_combined_sep$mz)
df_combined_sep$rt <- as.numeric(df_combined_sep$rt)
df_combined_sep$Subject <- as.character(df_combined_sep$Subject)
df_combined_sep$Intervention <- as.character(df_combined_sep$Intervention)
# rearrange column order
df_combined_sep <- df_combined_sep %>%
dplyr::select(sample_ID, pre_post, Intervention, everything())
str(df_combined_sep)## tibble [52,920 × 14] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : chr [1:52920] "6101_U4_C18POS_14" "6110_U2_C18POS_34" "6104_U2_C18POS_19" "6109_U4_C18POS_22" ...
## $ pre_post : chr [1:52920] "post" "post" "post" "post" ...
## $ Intervention : chr [1:52920] "Yellow" "Yellow" "Yellow" "Red" ...
## $ mz : num [1:52920] 227 227 227 227 227 ...
## $ rt : num [1:52920] 0.553 0.553 0.553 0.553 0.553 0.553 0.553 0.553 0.553 0.553 ...
## $ row_ID : num [1:52920] 37 37 37 37 37 37 37 37 37 37 ...
## $ peak_height : num [1:52920] 82834 159688 140461 47134 83790 ...
## $ Subject : chr [1:52920] "6101" "6110" "6104" "6109" ...
## $ Period : chr [1:52920] "U4" "U2" "U2" "U4" ...
## $ sequence : chr [1:52920] "R_Y" "Y_R" "Y_R" "Y_R" ...
## $ Intervention_week: num [1:52920] 14 6 6 14 2 10 2 14 6 6 ...
## $ Sex : chr [1:52920] "F" "M" "M" "F" ...
## $ Age : num [1:52920] 58 36 54 75 46 36 40 46 61 40 ...
## $ BMI : num [1:52920] 31.1 29.9 33.1 43.3 30 ...
# plot
(plot_mzvsrt <- df_combined_sep %>%
ggplot(aes(x = rt, y = mz)) +
geom_point() +
theme_minimal() +
labs(x = "Retention time, min",
y = "m/z, neutral",
title = "mz across RT for all features"))# NAs in all data including QCs
NAbyRow <- rowSums(is.na(omicsdata[,-1]))
hist(NAbyRow,
breaks = 56, # because there are 56 samples, 48 samples + 8 QCs
xlab = "Number of missing values",
ylab = "Number of metabolites",
main = "How many missing values are there?")# samples only (no QCs)
omicsdata_noQC <- omicsdata %>%
dplyr::select(-contains("QC"))
#NAs in samples only?
NAbyRow_noQC <- rowSums(is.na(omicsdata_noQC[,-1]))
hist(NAbyRow_noQC,
breaks = 48, # because there are 48 samples
xlab = "Number of missing values",
ylab = "Number of metabolites",
main = "How many missing values are there?")Are there any missing values in QCs? There shouldn’t be after data preprocessing/filtering
omicsdata_QC <- omicsdata %>%
dplyr::select(starts_with("P"))
NAbyRow_QC <- colSums(is.na(omicsdata_QC))
# lets confirm that there are no missing values from my QCs
sum(NAbyRow_QC) # no## [1] 0
# calculate how many NAs there are per feature in whole data set
contains_NAs <- df_combined %>%
group_by(mz_rt) %>%
count(is.na(peak_height)) %>%
filter(`is.na(peak_height)` == TRUE)
head(contains_NAs)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 44
## 2 106.0133_6.888 TRUE 36
## 3 1071.4352_3.059 TRUE 44
## 4 1106.5217_3.833 TRUE 44
## 5 119.0485_0.793 TRUE 36
## 6 119.0492_3.019 TRUE 21
NAs by groups
#calculate NAs per feature in red intervention
NAs_Red_Intervention <- df_combined %>%
group_by(mz_rt) %>%
filter(Intervention == "Red") %>%
count(is.na(peak_height)) %>%
filter(`is.na(peak_height)` == TRUE)
head(NAs_Red_Intervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 15
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 19
## 6 119.0492_3.019 TRUE 12
#calculate NAs per feature in yellow intervention
NAs_Yellow_Intervention <- df_combined %>%
group_by(mz_rt) %>%
filter(Intervention == "Yellow") %>%
count(is.na(peak_height)) %>%
filter(`is.na(peak_height)` == TRUE)
head(NAs_Yellow_Intervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 21
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 17
## 6 119.0492_3.019 TRUE 9
#calculate NAs per feature in before both interventions
NAs_preIntervention <- df_combined %>%
group_by(mz_rt) %>%
filter(pre_post == "pre") %>%
count(is.na(peak_height)) %>%
filter(`is.na(peak_height)` == TRUE)
head(NAs_preIntervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 17
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 18
## 6 119.0492_3.019 TRUE 10
#calculate NAs per feature after both interventions
NAs_postIntervention <- df_combined %>%
group_by(mz_rt) %>%
filter(pre_post == "post") %>%
count(is.na(peak_height)) %>%
filter(`is.na(peak_height)` == TRUE)
head(NAs_postIntervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 19
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 18
## 6 119.0492_3.019 TRUE 11
# impute any missing values by replacing them with 1/2 of the lowest peak height value of a feature (i.e. in a row).
imputed_omicsdata <- omicsdata
imputed_omicsdata[] <- lapply(imputed_omicsdata,
function(x) ifelse(is.na(x),
min(x, na.rm = TRUE)/2, x))
dim(imputed_omicsdata)## [1] 945 58
Are there any NAs?
## [1] 0
# create long df for imputed omics df
imputed_omicsdata_tidy <- imputed_omicsdata %>%
pivot_longer(cols = 3:ncol(.),
names_to = "sample_ID",
values_to = "peak_height")
# combine meta and imputed omics dfs
imputed_fulldata <- full_join(imputed_omicsdata_tidy,
metadata,
by = c("sample_ID" = "sample_ID"))
# separate mz and rt
imputed_fulldata_sep <- imputed_fulldata %>%
separate(col = mz_rt,
into = c("mz", "rt"),
sep = "_")
# convert columns to correct type
imputed_fulldata_sep$mz <- as.numeric(imputed_fulldata_sep$mz)
imputed_fulldata_sep$rt <- as.numeric(imputed_fulldata_sep$rt)
imputed_fulldata_sep$Subject <- as.character(imputed_fulldata_sep$Subject)
imputed_fulldata_sep$Intervention <- as.character(imputed_fulldata_sep$Intervention)vignette for reference
# create features list from imputed data set to only include unique feature ID's (mz_rt), mz and RT
features <- imputed_fulldata_sep %>%
cbind(imputed_fulldata$mz_rt) %>%
rename("mz_rt" = "imputed_fulldata$mz_rt") %>%
dplyr::select(c(mz_rt, mz, rt)) %>%
distinct() # remove the duplicate rows
# create a second data frame which is just imputed_fulldata restructured to another wide format
data_notame <- data.frame(imputed_omicsdata %>%
dplyr::select(-row_ID) %>%
t())
data_notame <- data_notame %>%
tibble::rownames_to_column() %>% # change samples from rownames to its own column
row_to_names(row_number = 1) # change the feature IDs (mz_rt) from first row obs into column namesCheck structures
## 'data.frame': 945 obs. of 3 variables:
## $ mz_rt: chr "226.9516_0.553" "159.1492_0.608" "175.1442_0.616" "189.1684_0.616" ...
## $ mz : num 227 159 175 189 189 ...
## $ rt : num 0.553 0.608 0.616 0.616 0.615 0.621 0.62 0.633 0.635 0.636 ...
## # A tibble: 945 × 3
## mz_rt mz rt
## <chr> <dbl> <dbl>
## 1 226.9516_0.553 227. 0.553
## 2 159.1492_0.608 159. 0.608
## 3 175.1442_0.616 175. 0.616
## 4 189.1684_0.616 189. 0.616
## 5 189.16_0.615 189. 0.615
## 6 156.0769_0.621 156. 0.621
## 7 170.0926_0.62 170. 0.62
## 8 136.0482_0.633 136. 0.633
## 9 151.1443_0.635 151. 0.635
## 10 137.071_0.636 137. 0.636
## # ℹ 935 more rows
## # A tibble: 56 × 946
## mz_rt `226.9516_0.553` `159.1492_0.608` `175.1442_0.616` `189.1684_0.616`
## <chr> <chr> <chr> <chr> <chr>
## 1 6101_U4_… " 82834.1800" " 40852.7970" " 13575.6550" " 19074.8930"
## 2 6110_U2_… " 159688.0300" " 17948.1500" " 21984.4790" " 17190.7050"
## 3 6104_U2_… " 140460.9000" " 32255.4550" " 14964.8470" " 20831.3890"
## 4 6109_U4_… " 47134.4200" " 63559.5400" " 52516.5600" " 24691.2460"
## 5 6111_U1_… " 83789.7700" " 131795.4400" " 47572.5400" " 31355.4630"
## 6 6110_U3_… " 115715.8700" " 71032.5500" " 14294.8420" " 27822.7420"
## 7 6105_U1_… " 141117.8600" " 72057.3500" " 44426.5080" " 28435.2440"
## 8 6111_U4_… " 103462.300" " 120798.030" " 50446.316" " 33316.652"
## 9 6113_U2_… " 121278.8000" " 92756.7900" " 37672.3800" " 34728.8440"
## 10 6105_U2_… " 92647.2340" " 98266.6800" " 55319.7970" " 26269.3400"
## # ℹ 46 more rows
## # ℹ 941 more variables: `189.16_0.615` <chr>, `156.0769_0.621` <chr>,
## # `170.0926_0.62` <chr>, `136.0482_0.633` <chr>, `151.1443_0.635` <chr>,
## # `137.071_0.636` <chr>, `182.0574_0.654` <chr>, `162.1126_0.642` <chr>,
## # `76.0757_0.642` <chr>, `114.0669_0.645` <chr>, `227.1255_0.647` <chr>,
## # `193.1547_0.646` <chr>, `219.1705_0.65` <chr>, `163.1243_0.654` <chr>,
## # `213.1234_0.672` <chr>, `203.1502_0.654` <chr>, `138.0551_0.654` <chr>, …
# change to results to numeric
data_notame <- data_notame %>%
mutate_at(-1, as.numeric)
tibble(data_notame)## # A tibble: 56 × 946
## mz_rt `226.9516_0.553` `159.1492_0.608` `175.1442_0.616` `189.1684_0.616`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6101_U4_… 82834. 40853. 13576. 19075.
## 2 6110_U2_… 159688. 17948. 21984. 17191.
## 3 6104_U2_… 140461. 32255. 14965. 20831.
## 4 6109_U4_… 47134. 63560. 52517. 24691.
## 5 6111_U1_… 83790. 131795. 47573. 31355.
## 6 6110_U3_… 115716. 71033. 14295. 27823.
## 7 6105_U1_… 141118. 72057. 44427. 28435.
## 8 6111_U4_… 103462. 120798. 50446. 33317.
## 9 6113_U2_… 121279. 92757. 37672. 34729.
## 10 6105_U2_… 92647. 98267. 55320. 26269.
## # ℹ 46 more rows
## # ℹ 941 more variables: `189.16_0.615` <dbl>, `156.0769_0.621` <dbl>,
## # `170.0926_0.62` <dbl>, `136.0482_0.633` <dbl>, `151.1443_0.635` <dbl>,
## # `137.071_0.636` <dbl>, `182.0574_0.654` <dbl>, `162.1126_0.642` <dbl>,
## # `76.0757_0.642` <dbl>, `114.0669_0.645` <dbl>, `227.1255_0.647` <dbl>,
## # `193.1547_0.646` <dbl>, `219.1705_0.65` <dbl>, `163.1243_0.654` <dbl>,
## # `213.1234_0.672` <dbl>, `203.1502_0.654` <dbl>, `138.0551_0.654` <dbl>, …
connection <- find_connections(data = data_notame,
features = features,
corr_thresh = 0.9,
rt_window = 1/60,
name_col = "mz_rt",
mz_col = "mz",
rt_col = "rt")## [1] 100
## [1] 200
## [1] 300
## [1] 400
## [1] 500
## [1] 600
## [1] 700
## [1] 800
## [1] 900
## x y cor rt_diff mz_diff
## 1 151.1443_0.635 76.0757_0.642 0.9772910 0.007 -75.0686
## 2 182.0574_0.654 287.1967_0.655 0.9865474 0.001 105.1393
## 3 114.0669_0.645 227.1255_0.647 0.9689492 0.002 113.0586
## 4 219.1705_0.65 145.1054_0.656 0.9099522 0.006 -74.0651
## 5 144.1023_0.656 145.1054_0.656 0.9856422 0.000 1.0031
## 6 343.1668_0.698 258.1241_0.69 0.9555601 -0.008 -85.0427
## 113 components found
##
## Component 100 / 113
## 37 components found
##
## 12 components found
##
## 9 components found
##
## 2 components found
##
## 1 components found
# assign a cluster ID to all features. Clusters are named after feature with highest median peak height
features_clustered <- assign_cluster_id(data_notame, clusters, features, name_col = "mz_rt")
# visualize clusters
#visualize_clusters(data_notame, features, clusters, min_size = 3, rt_window = 2,name_col = "mz_rt", mz_col = "mz", rt_col = "rt", file_path = "~/path/to/project/")
# lets see how many features are removed when we only keep one feature per cluster
pulled <- pull_clusters(data_notame, features_clustered, name_col = "mz_rt")
cluster_data <- pulled$cdata
cluster_features <- pulled$cfeatures
# export clustered feature list
write_csv(cluster_features,
"notame_dfs/cluster_features-_c18-pos.csv")
nrow(omicsdata) - nrow(cluster_features)## [1] 317
# transpose the full dataset back to wide so that it is more similar to the notame dataset
imputed_fulldata_wide <- imputed_fulldata %>%
dplyr::select(-"row_ID") %>%
pivot_wider(names_from = mz_rt,
values_from = peak_height)
# list of reduced features
clusternames <- cluster_features$mz_rt
#dplyr:: only the features are in the reduced list
imp_clust <- imputed_fulldata_wide[,c(names(imputed_fulldata_wide) %in% clusternames)]
# bind back sample names
imp_clust <- cbind(imputed_fulldata_wide[1], imp_clust)
ncol(imp_clust[,-1])## [1] 628
# plot new rt vs mz scatterplot post-clustering
(plot_mzvsrt_postcluster <- cluster_features %>%
ggplot(aes(x = rt,
y = mz)) +
geom_point() +
theme_minimal() +
labs(x = "Retention time, min",
y = "m/z, neutral",
title = "mz across RT for all features after clustering"))# plot both scatterplots to compare with and without notame clustering
(scatterplots <- ggarrange(plot_mzvsrt,
plot_mzvsrt_postcluster,
nrow = 2))# change meta data columns to character so that I can change NAs from QCs to "QC"
imp_metabind_clust <- imp_metabind_clust %>%
mutate_at(c("Subject",
"Period",
"Intervention",
"pre_post",
"sequence",
"Intervention_week",
"Sex",
"Age",
"BMI"),
as.character)
# replace NAs in metadata columns for QCs
imp_metabind_clust[is.na(imp_metabind_clust)] <- "QC"
# unite pre_post column with intervention column to create pre_intervention column
imp_metabind_clust <- imp_metabind_clust %>%
unite(col = "pre_post_intervention",
c("pre_post","Intervention"),
sep = "_",
remove = FALSE)
# long df
imp_metabind_clust_tidy <- imp_metabind_clust %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund")
# structure check
str(imp_metabind_clust_tidy)## tibble [35,168 × 13] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : chr [1:35168] "6101_U1_C18POS_59" "6101_U1_C18POS_59" "6101_U1_C18POS_59" "6101_U1_C18POS_59" ...
## $ Subject : chr [1:35168] "6101" "6101" "6101" "6101" ...
## $ Period : chr [1:35168] "U1" "U1" "U1" "U1" ...
## $ pre_post_intervention: chr [1:35168] "pre_Red" "pre_Red" "pre_Red" "pre_Red" ...
## $ Intervention : chr [1:35168] "Red" "Red" "Red" "Red" ...
## $ pre_post : chr [1:35168] "pre" "pre" "pre" "pre" ...
## $ sequence : chr [1:35168] "R_Y" "R_Y" "R_Y" "R_Y" ...
## $ Intervention_week : chr [1:35168] "2" "2" "2" "2" ...
## $ Sex : chr [1:35168] "F" "F" "F" "F" ...
## $ Age : chr [1:35168] "58" "58" "58" "58" ...
## $ BMI : chr [1:35168] "31.0556029483653" "31.0556029483653" "31.0556029483653" "31.0556029483653" ...
## $ mz_rt : chr [1:35168] "226.9516_0.553" "159.1492_0.608" "175.1442_0.616" "189.1684_0.616" ...
## $ rel_abund : num [1:35168] 108697 81884 16201 28592 407452 ...
imp_metabind_clust_tidy %>%
ggplot(aes(x = sample_ID, y = rel_abund, color = Intervention)) +
geom_boxplot(alpha = 0.6) +
scale_color_manual(values = c("light grey", "tomato1", "gold")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90)) +
labs(title = "LC-MS (+) Feature Abundances by Sample",
subtitle = "Unscaled data",
y = "Relative abundance")
Will need to log transform in order to normalize and actually see the
data
imp_metabind_clust_tidy_log2 <- imp_metabind_clust_tidy %>%
mutate(rel_abund_log2 = if_else(rel_abund > 0, log2(rel_abund), 0)) %>%
replace(is.na(.), 0)(bp_data_quality <- imp_metabind_clust_tidy_log2 %>%
ggplot(aes(x = sample_ID, y = rel_abund_log2, fill = Intervention)) +
geom_boxplot(alpha = 0.6) +
scale_fill_manual(values = c("light grey", "tomato1", "gold")) +
theme_minimal() +
labs(title = "LC-MS (+) Feature Abundances by Sample",
subtitle = "Log2 transformed data",
y = "Relative abundance"))# filtered and imputed data after notame clustering, transposed
features_testforQCcorr <- t(imp_clust) %>%
as.data.frame() %>%
row_to_names(row_number = "find_header")
# log2 transform
log2_features_testforQCcorr <- features_testforQCcorr %>%
mutate_all(as.numeric) %>%
log2()
# write csv to manually edit
write.csv(log2_features_testforQCcorr,
"notame_dfs/feaures_test.csv",
row.names = TRUE)Import corrected df (edited so that mz_rt could rowname 1)
# for some reason the r is not recognizing my wd so I'll run file.choose
features_forQCcorr <- read.csv("notame_dfs/feaures_forQCcorr.csv",
header = FALSE,
row.names = 1)
features_forQCcorr <- features_forQCcorr %>%
rownames_to_column(var = "mz_rt") %>%
row_to_names(row_number = 1)%>%
separate(col = mz_rt,
into = c("mz", "rt"),
sep = "_")
write.csv(features_forQCcorr,
"notame_dfs/features_forQCcorr_pt2.csv",
row.names = TRUE)# separate sampleID and injection order
pheno_data <- imp_clust[1] %>%
separate(col = sample_ID,
into = c("sample_ID", "injection_order"),
sep = "_c18POS_")
# fix injection numbers to correct number
pheno_data[54, "injection_order"] <- 9
pheno_data[40, "injection_order"] <- 10
pheno_data[13, "injection_order"] <- 26
pheno_data[24, "injection_order"] <- 27
pheno_data[42, "injection_order"] <- 28
pheno_data[32, "injection_order"] <- 29
# make inj order column numeric
pheno_data <- pheno_data %>%
mutate_at("injection_order", as.numeric)
t_pheno_data <- as.data.frame(t(pheno_data))
write.csv(t_pheno_data,
"notame_dfs/pheno_df.csv",
row.names = TRUE)Combine pheno and feature dfs manually in excel to create metaboset df.
#make sure when converting csv to xlsx that you save as a new file, don't just change the name of the file
metaboset <- read_from_excel("notame_dfs/metaboset.xlsx",
split_by = c("column", "Ion mode"))## INFO [2025-07-31 15:58:41] Detecting corner position
## INFO [2025-07-31 15:58:41] Corner detected correctly at row 3, column D
## INFO [2025-07-31 15:58:41]
## Extracting sample information from rows 1 to 3 and columns E to BH
## INFO [2025-07-31 15:58:41] Replacing spaces in sample information column names with underscores (_)
## INFO [2025-07-31 15:58:41] Naming the last column of sample information "Datafile"
## INFO [2025-07-31 15:58:41]
## Extracting feature information from rows 4 to 631 and columns A to D
## INFO [2025-07-31 15:58:41] Creating Split column from column, Ion mode
## INFO [2025-07-31 15:58:41] Feature_ID column not found, creating feature IDs
## INFO [2025-07-31 15:58:41] Identified m/z column mass and retention time column rt
## INFO [2025-07-31 15:58:41] Identified m/z column mass and retention time column rt
## INFO [2025-07-31 15:58:41] Creating feature IDs from Split, m/z and retention time
## INFO [2025-07-31 15:58:41] Replacing dots (.) in feature information column names with underscores (_)
## INFO [2025-07-31 15:58:41]
## Extracting feature abundances from rows 4 to 631 and columns E to BH
## INFO [2025-07-31 15:58:41]
## Checking sample information
## INFO [2025-07-31 15:58:41] QC column generated from rows containing 'QC'
## INFO [2025-07-31 15:58:41] Sample ID autogenerated from injection orders and prefix ID_
## INFO [2025-07-31 15:58:41] Checking that feature abundances only contain numeric values
## INFO [2025-07-31 15:58:41]
## Checking feature information
## INFO [2025-07-31 15:58:41] Checking that feature IDs are unique and not stored as numbers
## INFO [2025-07-31 15:58:41] Checking that m/z and retention time values are reasonable
## INFO [2025-07-31 15:58:41] Identified m/z column mass and retention time column rt
## INFO [2025-07-31 15:58:41] Identified m/z column mass and retention time column rt
#construct Metaboset
modes <- construct_metabosets(exprs = metaboset$exprs,
pheno_data = metaboset$pheno_data,
feature_data = metaboset$feature_data, group_col = "Class")## Initializing the object(s) with unflagged features
## INFO [2025-07-31 15:58:41]
## Checking feature information
## INFO [2025-07-31 15:58:41] Checking that feature IDs are unique and not stored as numbers
## INFO [2025-07-31 15:58:41] Checking that feature abundances only contain numeric values
## INFO [2025-07-31 15:58:41] Setting row and column names of exprs based on feature and pheno data
# ordered by injection
(qualityBPs_b4correction <- plot_sample_boxplots(mode, order_by = "Class", title = "c18 (+) uncorrected feature abundance") +
scale_fill_manual(values = c("light gray", "deepskyblue2")))
#ordered by class
plot_sample_boxplots(mode, order_by = "Injection_order", title = "Uncorrected feature abundance") +
scale_fill_manual(values = c("light gray", "deepskyblue2"))drift corrected takes up to 2 minutes
output is percent of the features that were drift corrected. The remaining “low-quality” percent represents features for which the DC did not improve the RSD and D-ratio of the original data.
metabdata_corrected_MZ_RT <- metabdata_corrected %>%
mutate(mass = round(metabdata_corrected$mass, digits = 4), # Decrease number of decimals for m/z & rt
rt = round(metabdata_corrected$rt, digits = 3),
.before=1,
.keep="unused") %>%
unite(mz_rt, c(mass, rt), remove=TRUE) # Combine m/z & rt with _ in betweenI want the new drift corrected (DC) df to look just like “imp_metabind_clust_log2” df
# go back to wide data
imp_metabind_clust_log2 <- imp_metabind_clust_tidy_log2 %>%
dplyr::select(!rel_abund) %>%
pivot_wider(names_from = mz_rt,
values_from = rel_abund_log2)# combine subject and period columns from imp_metabind_clust_log2 in order to mimic DC df
subj_per_imp_metabind_clust_log2 <- imp_metabind_clust_log2 %>%
unite(subj_period, c(Subject, Period), remove = FALSE)
# place new DC observations in
DC_imp_metabind_clust_log2 <- full_join(subj_per_imp_metabind_clust_log2[,c(1:12)], metabdata_corrected_t, by = "subj_period")
# take out old QC observations
DC_imp_metabind_clust_log2 <- DC_imp_metabind_clust_log2 %>%
filter(subj_period != "QC_QC")
# replace NAs in columns for QCs
DC_imp_metabind_clust_log2[is.na(DC_imp_metabind_clust_log2)] <- "QC"Here, I’m inserting a key that would indicate identified features at level 3 ID or higher.
key_omics <- read_xlsx("../annotated-features-table.xlsx",
sheet = "Features") %>%
clean_names() %>%
filter(lc_mode_c18_hilic == "C18") %>% # LC mode
filter(esi_mode == "+") %>% # ESI mode
dplyr::select(mz_rt, annotation, metabolite_class, parent_compound) # select relevant columns
# add key columns (left_join to only keep all observations present in full feature table)
anno_imp_metabind_clust_tidy_log2 <- left_join(DC_imp_metabind_clust_tidy_log2, key_omics, by = "mz_rt") %>%
# replace NAs in feature ID columns to un-annotated feature id's (mz_rt)
mutate(Feature_ID = coalesce(annotation, mz_rt))# go back to wide data
anno_imp_metabind_clust_log2 <- anno_imp_metabind_clust_tidy_log2 %>%
dplyr::select(c(1:11),
Feature_ID, rel_abund_log2) %>%
pivot_wider(names_from = Feature_ID,
values_from = rel_abund_log2) PCA.DC_imp_metabind_clust_log2 <- PCA(anno_imp_metabind_clust_log2, # wide data
quali.sup = 1:11, # remove qualitative variables
graph = FALSE, # don't graph
scale.unit = FALSE) # don't scale, already transformed data
# PCA summary
kable(summary(PCA.DC_imp_metabind_clust_log2))##
## Call:
## PCA(X = anno_imp_metabind_clust_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 683.902 247.574 104.066 71.119 61.213 50.743 41.782
## % of var. 42.958 15.551 6.537 4.467 3.845 3.187 2.624
## Cumulative % of var. 42.958 58.509 65.045 69.513 73.357 76.545 79.169
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 34.622 28.155 25.591 23.197 21.062 17.007 16.217
## % of var. 2.175 1.768 1.607 1.457 1.323 1.068 1.019
## Cumulative % of var. 81.344 83.112 84.720 86.177 87.500 88.568 89.587
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 12.253 11.014 10.922 9.426 9.061 8.556 7.560
## % of var. 0.770 0.692 0.686 0.592 0.569 0.537 0.475
## Cumulative % of var. 90.356 91.048 91.734 92.326 92.896 93.433 93.908
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 6.762 6.667 6.434 5.911 5.234 5.011 4.769
## % of var. 0.425 0.419 0.404 0.371 0.329 0.315 0.300
## Cumulative % of var. 94.332 94.751 95.155 95.527 95.855 96.170 96.470
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 4.426 4.391 4.032 3.758 3.644 3.348 3.058
## % of var. 0.278 0.276 0.253 0.236 0.229 0.210 0.192
## Cumulative % of var. 96.748 97.024 97.277 97.513 97.742 97.952 98.144
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 3.027 2.796 2.600 2.505 2.456 2.352 2.259
## % of var. 0.190 0.176 0.163 0.157 0.154 0.148 0.142
## Cumulative % of var. 98.334 98.510 98.673 98.831 98.985 99.133 99.274
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47 Dim.48 Dim.49
## Variance 2.131 1.959 1.782 1.670 1.499 1.445 0.318
## % of var. 0.134 0.123 0.112 0.105 0.094 0.091 0.020
## Cumulative % of var. 99.408 99.531 99.643 99.748 99.842 99.933 99.953
## Dim.50 Dim.51 Dim.52 Dim.53 Dim.54 Dim.55
## Variance 0.191 0.178 0.158 0.148 0.073 0.000
## % of var. 0.012 0.011 0.010 0.009 0.005 0.000
## Cumulative % of var. 99.965 99.976 99.986 99.995 100.000 100.000
##
## Individuals (the 10 first)
## Dist Dim.1
## 1 | 29.831 | -20.616
## 2 | 27.573 | -20.316
## 3 | 30.832 | -10.133
## 4 | 29.309 | -19.939
## 5 | 29.451 | -17.509
## 6 | 65.518 | 48.818
## 7 | 32.386 | -17.632
## 8 | 27.196 | -14.505
## 9 | 25.206 | -13.566
## 10 | 31.113 | -13.405
## ctr cos2
## 1 1.110 0.478 |
## 2 1.078 0.543 |
## 3 0.268 0.108 |
## 4 1.038 0.463 |
## 5 0.800 0.353 |
## 6 6.223 0.555 |
## 7 0.812 0.296 |
## 8 0.549 0.284 |
## 9 0.481 0.290 |
## 10 0.469 0.186 |
## Dim.2 ctr
## 1 -5.138 0.190
## 2 -5.252 0.199
## 3 -1.549 0.017
## 4 -2.943 0.062
## 5 -3.955 0.113
## 6 -34.458 8.564
## 7 -2.927 0.062
## 8 -4.137 0.123
## 9 -3.434 0.085
## 10 -1.236 0.011
## cos2 Dim.3
## 1 0.030 | -2.862
## 2 0.036 | -4.679
## 3 0.003 | -0.393
## 4 0.010 | -2.200
## 5 0.018 | -3.079
## 6 0.277 | -15.383
## 7 0.008 | -2.138
## 8 0.023 | -3.382
## 9 0.019 | 3.566
## 10 0.002 | -7.102
## ctr cos2
## 1 0.141 0.009 |
## 2 0.376 0.029 |
## 3 0.003 0.000 |
## 4 0.083 0.006 |
## 5 0.163 0.011 |
## 6 4.061 0.055 |
## 7 0.078 0.004 |
## 8 0.196 0.015 |
## 9 0.218 0.020 |
## 10 0.866 0.052 |
##
## Variables (the 10 first)
## Dim.1 ctr
## 226.9516_0.553 | -0.060 0.001
## 159.1492_0.608 | -0.520 0.040
## 175.1442_0.616 | 0.032 0.000
## 189.1684_0.616 | -0.053 0.000
## 189.16_0.615 | -0.023 0.000
## 156.0769_0.621 | -0.011 0.000
## 170.0926_0.62 | -0.104 0.002
## 136.0482_0.633 | 0.014 0.000
## 137.071_0.636 | 0.039 0.000
## 162.1126_0.642 | 0.106 0.002
## cos2 Dim.2
## 226.9516_0.553 0.014 | 0.106
## 159.1492_0.608 0.329 | -0.298
## 175.1442_0.616 0.001 | 0.341
## 189.1684_0.616 0.007 | 0.008
## 189.16_0.615 0.002 | -0.006
## 156.0769_0.621 0.000 | -0.071
## 170.0926_0.62 0.012 | -0.052
## 136.0482_0.633 0.000 | -0.037
## 137.071_0.636 0.006 | 0.141
## 162.1126_0.642 0.010 | -0.216
## ctr cos2
## 226.9516_0.553 0.005 0.043 |
## 159.1492_0.608 0.036 0.108 |
## 175.1442_0.616 0.047 0.159 |
## 189.1684_0.616 0.000 0.000 |
## 189.16_0.615 0.000 0.000 |
## 156.0769_0.621 0.002 0.007 |
## 170.0926_0.62 0.001 0.003 |
## 136.0482_0.633 0.001 0.003 |
## 137.071_0.636 0.008 0.071 |
## 162.1126_0.642 0.019 0.041 |
## Dim.3 ctr cos2
## 226.9516_0.553 0.032 0.001 0.004
## 159.1492_0.608 -0.081 0.006 0.008
## 175.1442_0.616 -0.195 0.036 0.052
## 189.1684_0.616 0.140 0.019 0.051
## 189.16_0.615 -0.058 0.003 0.011
## 156.0769_0.621 -0.027 0.001 0.001
## 170.0926_0.62 0.288 0.080 0.089
## 136.0482_0.633 0.068 0.004 0.011
## 137.071_0.636 0.012 0.000 0.001
## 162.1126_0.642 0.274 0.072 0.066
##
## 226.9516_0.553 |
## 159.1492_0.608 |
## 175.1442_0.616 |
## 189.1684_0.616 |
## 189.16_0.615 |
## 156.0769_0.621 |
## 170.0926_0.62 |
## 136.0482_0.633 |
## 137.071_0.636 |
## 162.1126_0.642 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1
## sample_ID_6101_U1_C18POS_59 | 29.831 | -20.616
## sample_ID_6101_U2_C18POS_30 | 33.739 | -15.996
## sample_ID_6101_U3_C18POS_29_1 | 28.213 | -19.582
## sample_ID_6101_U4_C18POS_14 | 29.307 | -19.516
## sample_ID_6102_U1_C18POS_26_1 | 27.573 | -20.316
## sample_ID_6102_U2_C18POS_16 | 26.916 | -15.644
## sample_ID_6102_U3_C18POS_48 | 35.629 | -13.369
## sample_ID_6102_U4_C18POS_50 | 26.846 | -15.989
## sample_ID_6103_U1_C18POS_21 | 30.832 | -10.133
## sample_ID_6103_U2_C18POS_60 | 33.584 | -14.463
## cos2 v.test
## sample_ID_6101_U1_C18POS_59 0.478 -0.788 |
## sample_ID_6101_U2_C18POS_30 0.225 -0.612 |
## sample_ID_6101_U3_C18POS_29_1 0.482 -0.749 |
## sample_ID_6101_U4_C18POS_14 0.443 -0.746 |
## sample_ID_6102_U1_C18POS_26_1 0.543 -0.777 |
## sample_ID_6102_U2_C18POS_16 0.338 -0.598 |
## sample_ID_6102_U3_C18POS_48 0.141 -0.511 |
## sample_ID_6102_U4_C18POS_50 0.355 -0.611 |
## sample_ID_6103_U1_C18POS_21 0.108 -0.387 |
## sample_ID_6103_U2_C18POS_60 0.185 -0.553 |
## Dim.2 cos2
## sample_ID_6101_U1_C18POS_59 -5.138 0.030
## sample_ID_6101_U2_C18POS_30 -3.239 0.009
## sample_ID_6101_U3_C18POS_29_1 -4.777 0.029
## sample_ID_6101_U4_C18POS_14 -4.083 0.019
## sample_ID_6102_U1_C18POS_26_1 -5.252 0.036
## sample_ID_6102_U2_C18POS_16 -2.420 0.008
## sample_ID_6102_U3_C18POS_48 0.584 0.000
## sample_ID_6102_U4_C18POS_50 -2.253 0.007
## sample_ID_6103_U1_C18POS_21 -1.549 0.003
## sample_ID_6103_U2_C18POS_60 -1.448 0.002
## v.test Dim.3
## sample_ID_6101_U1_C18POS_59 -0.327 | -2.862
## sample_ID_6101_U2_C18POS_30 -0.206 | 8.201
## sample_ID_6101_U3_C18POS_29_1 -0.304 | -2.304
## sample_ID_6101_U4_C18POS_14 -0.260 | -2.065
## sample_ID_6102_U1_C18POS_26_1 -0.334 | -4.679
## sample_ID_6102_U2_C18POS_16 -0.154 | 7.097
## sample_ID_6102_U3_C18POS_48 0.037 | 1.997
## sample_ID_6102_U4_C18POS_50 -0.143 | -1.443
## sample_ID_6103_U1_C18POS_21 -0.098 | -0.393
## sample_ID_6103_U2_C18POS_60 -0.092 | 8.967
## cos2 v.test
## sample_ID_6101_U1_C18POS_59 0.009 -0.281 |
## sample_ID_6101_U2_C18POS_30 0.059 0.804 |
## sample_ID_6101_U3_C18POS_29_1 0.007 -0.226 |
## sample_ID_6101_U4_C18POS_14 0.005 -0.202 |
## sample_ID_6102_U1_C18POS_26_1 0.029 -0.459 |
## sample_ID_6102_U2_C18POS_16 0.070 0.696 |
## sample_ID_6102_U3_C18POS_48 0.003 0.196 |
## sample_ID_6102_U4_C18POS_50 0.003 -0.141 |
## sample_ID_6103_U1_C18POS_21 0.000 -0.038 |
## sample_ID_6103_U2_C18POS_60 0.071 0.879 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sample_ID_6101_U1_C18POS_59 | | | 29.831 | | | -20.616 | 0.478 | -0.788 | | | -5.138 | 0.030 | -0.327 | | | -2.862 | 0.009 | -0.281 | | |
| sample_ID_6101_U2_C18POS_30 | | | 33.739 | | | -15.996 | 0.225 | -0.612 | | | -3.239 | 0.009 | -0.206 | | | 8.201 | 0.059 | 0.804 | | |
| sample_ID_6101_U3_C18POS_29_1 | | | 28.213 | | | -19.582 | 0.482 | -0.749 | | | -4.777 | 0.029 | -0.304 | | | -2.304 | 0.007 | -0.226 | | |
| sample_ID_6101_U4_C18POS_14 | | | 29.307 | | | -19.516 | 0.443 | -0.746 | | | -4.083 | 0.019 | -0.260 | | | -2.065 | 0.005 | -0.202 | | |
| sample_ID_6102_U1_C18POS_26_1 | | | 27.573 | | | -20.316 | 0.543 | -0.777 | | | -5.252 | 0.036 | -0.334 | | | -4.679 | 0.029 | -0.459 | | |
| sample_ID_6102_U2_C18POS_16 | | | 26.916 | | | -15.644 | 0.338 | -0.598 | | | -2.420 | 0.008 | -0.154 | | | 7.097 | 0.070 | 0.696 | | |
| sample_ID_6102_U3_C18POS_48 | | | 35.629 | | | -13.369 | 0.141 | -0.511 | | | 0.584 | 0.000 | 0.037 | | | 1.997 | 0.003 | 0.196 | | |
| sample_ID_6102_U4_C18POS_50 | | | 26.846 | | | -15.989 | 0.355 | -0.611 | | | -2.253 | 0.007 | -0.143 | | | -1.443 | 0.003 | -0.141 | | |
| sample_ID_6103_U1_C18POS_21 | | | 30.832 | | | -10.133 | 0.108 | -0.387 | | | -1.549 | 0.003 | -0.098 | | | -0.393 | 0.000 | -0.038 | | |
| sample_ID_6103_U2_C18POS_60 | | | 33.584 | | | -14.463 | 0.185 | -0.553 | | | -1.448 | 0.002 | -0.092 | | | 8.967 | 0.071 | 0.879 | | |
# pull PC coordinates into df
PC_coord_QC_log2 <- as.data.frame(PCA.DC_imp_metabind_clust_log2$ind$coord)
# bind back metadata from cols 1-10
PC_coord_QC_log2 <- bind_cols(anno_imp_metabind_clust_log2[,1:11], PC_coord_QC_log2)
# grab some variance explained
importance_QC <- PCA.DC_imp_metabind_clust_log2$eig
# set variance explained with PC1, round to 2 digits
PC1_withQC <- round(importance_QC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_withQC <- round(importance_QC[2,2], 2)Using FactoExtra package
| eigenvalue | variance.percent | cumulative.variance.percent | |
|---|---|---|---|
| Dim.1 | 683.9018350 | 42.9578232 | 42.95782 |
| Dim.2 | 247.5739259 | 15.5508238 | 58.50865 |
| Dim.3 | 104.0663840 | 6.5367061 | 65.04535 |
| Dim.4 | 71.1186431 | 4.4671646 | 69.51252 |
| Dim.5 | 61.2125083 | 3.8449321 | 73.35745 |
| Dim.6 | 50.7432449 | 3.1873278 | 76.54478 |
| Dim.7 | 41.7815783 | 2.6244200 | 79.16920 |
| Dim.8 | 34.6220192 | 2.1747077 | 81.34391 |
| Dim.9 | 28.1545052 | 1.7684647 | 83.11237 |
| Dim.10 | 25.5909303 | 1.6074393 | 84.71981 |
| Dim.11 | 23.1967496 | 1.4570539 | 86.17686 |
| Dim.12 | 21.0622589 | 1.3229805 | 87.49984 |
| Dim.13 | 17.0067675 | 1.0682435 | 88.56809 |
| Dim.14 | 16.2173468 | 1.0186578 | 89.58675 |
| Dim.15 | 12.2532775 | 0.7696633 | 90.35641 |
| Dim.16 | 11.0141758 | 0.6918318 | 91.04824 |
| Dim.17 | 10.9220212 | 0.6860433 | 91.73428 |
| Dim.18 | 9.4259369 | 0.5920700 | 92.32635 |
| Dim.19 | 9.0611230 | 0.5691550 | 92.89551 |
| Dim.20 | 8.5555767 | 0.5374002 | 93.43291 |
| Dim.21 | 7.5595219 | 0.4748351 | 93.90774 |
| Dim.22 | 6.7618090 | 0.4247285 | 94.33247 |
| Dim.23 | 6.6673590 | 0.4187958 | 94.75127 |
| Dim.24 | 6.4343824 | 0.4041619 | 95.15543 |
| Dim.25 | 5.9106377 | 0.3712640 | 95.52669 |
| Dim.26 | 5.2343762 | 0.3287861 | 95.85548 |
| Dim.27 | 5.0110426 | 0.3147579 | 96.17024 |
| Dim.28 | 4.7692869 | 0.2995725 | 96.46981 |
| Dim.29 | 4.4255192 | 0.2779795 | 96.74779 |
| Dim.30 | 4.3907598 | 0.2757961 | 97.02359 |
| Dim.31 | 4.0323478 | 0.2532833 | 97.27687 |
| Dim.32 | 3.7576956 | 0.2360316 | 97.51290 |
| Dim.33 | 3.6443434 | 0.2289116 | 97.74181 |
| Dim.34 | 3.3478394 | 0.2102873 | 97.95210 |
| Dim.35 | 3.0581248 | 0.1920895 | 98.14419 |
| Dim.36 | 3.0269329 | 0.1901303 | 98.33432 |
| Dim.37 | 2.7960437 | 0.1756275 | 98.50995 |
| Dim.38 | 2.5999216 | 0.1633085 | 98.67326 |
| Dim.39 | 2.5045627 | 0.1573187 | 98.83057 |
| Dim.40 | 2.4556572 | 0.1542468 | 98.98482 |
| Dim.41 | 2.3524285 | 0.1477627 | 99.13258 |
| Dim.42 | 2.2591637 | 0.1419045 | 99.27449 |
| Dim.43 | 2.1308919 | 0.1338474 | 99.40834 |
| Dim.44 | 1.9585097 | 0.1230196 | 99.53136 |
| Dim.45 | 1.7816994 | 0.1119136 | 99.64327 |
| Dim.46 | 1.6695074 | 0.1048665 | 99.74814 |
| Dim.47 | 1.4992727 | 0.0941736 | 99.84231 |
| Dim.48 | 1.4451116 | 0.0907716 | 99.93308 |
| Dim.49 | 0.3180101 | 0.0199751 | 99.95306 |
| Dim.50 | 0.1910025 | 0.0119974 | 99.96505 |
| Dim.51 | 0.1781614 | 0.0111908 | 99.97624 |
| Dim.52 | 0.1576564 | 0.0099028 | 99.98615 |
| Dim.53 | 0.1477544 | 0.0092809 | 99.99543 |
| Dim.54 | 0.0727906 | 0.0045722 | 100.00000 |
| Dim.55 | 0.0000037 | 0.0000002 | 100.00000 |
# manual scores plot
(PCA_withQCs <- PC_coord_QC_log2 %>%
ggplot(aes(x = Dim.1, y = Dim.2,
fill = factor(Intervention, levels = c("Yellow", "Red", "QC")))) +
geom_point(shape = 21, alpha = 0.8) +
scale_fill_manual(values = c("gold", "tomato1", "light grey")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_withQC/PC1_withQC) +
labs(x = glue::glue("PC1: {PC1_withQC}%"),
y = glue::glue("PC2: {PC2_withQC}%"),
fill = "Group",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data"))anno_imp_metabind_clust_log2_noQCs <- anno_imp_metabind_clust_log2 %>%
filter(Intervention != "QC")
PCA.DC_imp_metabind_clust_log2_noQCs <- PCA(anno_imp_metabind_clust_log2_noQCs, # wide data
quali.sup=1:11, # remove qualitative variables
graph=FALSE, # don't graph
scale.unit=FALSE) # don't scale, we already did this
# look at summary
kable(summary(PCA.DC_imp_metabind_clust_log2_noQCs))##
## Call:
## PCA(X = anno_imp_metabind_clust_log2_noQCs, scale.unit = FALSE,
## quali.sup = 1:11, graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 441.833 212.422 85.731 80.113 59.413 51.306 45.168
## % of var. 33.213 15.968 6.444 6.022 4.466 3.857 3.395
## Cumulative % of var. 33.213 49.180 55.625 61.647 66.113 69.970 73.365
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 34.113 29.960 27.706 25.293 19.946 18.931 17.610
## % of var. 2.564 2.252 2.083 1.901 1.499 1.423 1.324
## Cumulative % of var. 75.929 78.181 80.264 82.165 83.665 85.088 86.411
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 13.981 12.769 11.055 10.570 10.091 9.072 8.181
## % of var. 1.051 0.960 0.831 0.795 0.759 0.682 0.615
## Cumulative % of var. 87.462 88.422 89.253 90.048 90.806 91.488 92.103
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 7.888 7.504 6.922 6.385 5.991 5.614 5.164
## % of var. 0.593 0.564 0.520 0.480 0.450 0.422 0.388
## Cumulative % of var. 92.696 93.260 93.780 94.260 94.711 95.133 95.521
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 5.135 4.702 4.388 4.250 3.904 3.570 3.553
## % of var. 0.386 0.353 0.330 0.319 0.293 0.268 0.267
## Cumulative % of var. 95.907 96.260 96.590 96.910 97.203 97.471 97.739
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 3.338 3.092 2.969 2.906 2.788 2.652 2.510
## % of var. 0.251 0.232 0.223 0.218 0.210 0.199 0.189
## Cumulative % of var. 97.989 98.222 98.445 98.664 98.873 99.072 99.261
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47
## Variance 2.296 2.110 1.953 1.767 1.704
## % of var. 0.173 0.159 0.147 0.133 0.128
## Cumulative % of var. 99.434 99.592 99.739 99.872 100.000
##
## Individuals (the 10 first)
## Dist Dim.1
## 1 | 24.673 | -9.190
## 2 | 21.767 | -8.370
## 3 | 29.267 | -2.553
## 4 | 24.488 | -9.865
## 5 | 25.247 | -7.014
## 6 | 70.409 | 66.408
## 7 | 28.585 | -7.483
## 8 | 23.553 | -4.287
## 9 | 21.985 | -4.779
## 10 | 28.428 | -4.512
## ctr cos2
## 1 0.398 0.139 |
## 2 0.330 0.148 |
## 3 0.031 0.008 |
## 4 0.459 0.162 |
## 5 0.232 0.077 |
## 6 20.794 0.890 |
## 7 0.264 0.069 |
## 8 0.087 0.033 |
## 9 0.108 0.047 |
## 10 0.096 0.025 |
## Dim.2 ctr cos2
## 1 -7.502 0.552 0.092
## 2 -6.285 0.387 0.083
## 3 -1.485 0.022 0.003
## 4 -5.820 0.332 0.056
## 5 -4.946 0.240 0.038
## 6 2.369 0.055 0.001
## 7 -4.034 0.160 0.020
## 8 -3.661 0.131 0.024
## 9 -5.390 0.285 0.060
## 10 0.271 0.001 0.000
## Dim.3 ctr
## 1 | -9.728 2.300
## 2 | -8.551 1.777
## 3 | -0.811 0.016
## 4 | -7.909 1.520
## 5 | -5.180 0.652
## 6 | -8.327 1.685
## 7 | -9.038 1.985
## 8 | -3.914 0.372
## 9 | 5.090 0.630
## 10 | -7.668 1.429
## cos2
## 1 0.155 |
## 2 0.154 |
## 3 0.001 |
## 4 0.104 |
## 5 0.042 |
## 6 0.014 |
## 7 0.100 |
## 8 0.028 |
## 9 0.054 |
## 10 0.073 |
##
## Variables (the 10 first)
## Dim.1 ctr
## 226.9516_0.553 | -0.162 0.006
## 159.1492_0.608 | -0.083 0.002
## 175.1442_0.616 | -0.124 0.004
## 189.1684_0.616 | -0.087 0.002
## 189.16_0.615 | -0.036 0.000
## 156.0769_0.621 | -0.039 0.000
## 170.0926_0.62 | -0.208 0.010
## 136.0482_0.633 | 0.007 0.000
## 137.071_0.636 | -0.056 0.001
## 162.1126_0.642 | 0.050 0.001
## cos2 Dim.2
## 226.9516_0.553 0.088 | -0.001
## 159.1492_0.608 0.012 | -0.236
## 175.1442_0.616 0.018 | 0.419
## 189.1684_0.616 0.017 | -0.089
## 189.16_0.615 0.004 | -0.024
## 156.0769_0.621 0.002 | -0.145
## 170.0926_0.62 0.040 | -0.333
## 136.0482_0.633 0.000 | -0.078
## 137.071_0.636 0.010 | 0.125
## 162.1126_0.642 0.002 | -0.412
## ctr cos2
## 226.9516_0.553 0.000 0.000 |
## 159.1492_0.608 0.026 0.094 |
## 175.1442_0.616 0.083 0.207 |
## 189.1684_0.616 0.004 0.018 |
## 189.16_0.615 0.000 0.002 |
## 156.0769_0.621 0.010 0.024 |
## 170.0926_0.62 0.052 0.103 |
## 136.0482_0.633 0.003 0.012 |
## 137.071_0.636 0.007 0.049 |
## 162.1126_0.642 0.080 0.133 |
## Dim.3 ctr cos2
## 226.9516_0.553 -0.044 0.002 0.006
## 159.1492_0.608 -0.091 0.010 0.014
## 175.1442_0.616 -0.022 0.001 0.001
## 189.1684_0.616 0.060 0.004 0.008
## 189.16_0.615 -0.105 0.013 0.031
## 156.0769_0.621 -0.179 0.037 0.036
## 170.0926_0.62 -0.104 0.013 0.010
## 136.0482_0.633 -0.025 0.001 0.001
## 137.071_0.636 -0.003 0.000 0.000
## 162.1126_0.642 -0.265 0.082 0.055
##
## 226.9516_0.553 |
## 159.1492_0.608 |
## 175.1442_0.616 |
## 189.1684_0.616 |
## 189.16_0.615 |
## 156.0769_0.621 |
## 170.0926_0.62 |
## 136.0482_0.633 |
## 137.071_0.636 |
## 162.1126_0.642 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1
## 6101_U1_C18POS_59 | 24.673 | -9.190
## 6101_U2_C18POS_30 | 31.022 | -7.697
## 6101_U3_C18POS_29_1 | 23.104 | -8.563
## 6101_U4_C18POS_14 | 24.577 | -9.044
## 6102_U1_C18POS_26_1 | 21.767 | -8.370
## 6102_U2_C18POS_16 | 23.410 | -7.332
## 6102_U3_C18POS_48 | 33.928 | -7.323
## 6102_U4_C18POS_50 | 23.082 | -7.190
## 6103_U1_C18POS_21 | 29.267 | -2.553
## 6103_U2_C18POS_60 | 31.321 | -7.237
## cos2 v.test
## 6101_U1_C18POS_59 0.139 -0.437 |
## 6101_U2_C18POS_30 0.062 -0.366 |
## 6101_U3_C18POS_29_1 0.137 -0.407 |
## 6101_U4_C18POS_14 0.135 -0.430 |
## 6102_U1_C18POS_26_1 0.148 -0.398 |
## 6102_U2_C18POS_16 0.098 -0.349 |
## 6102_U3_C18POS_48 0.047 -0.348 |
## 6102_U4_C18POS_50 0.097 -0.342 |
## 6103_U1_C18POS_21 0.008 -0.121 |
## 6103_U2_C18POS_60 0.053 -0.344 |
## Dim.2 cos2 v.test
## 6101_U1_C18POS_59 -7.502 0.092 -0.515
## 6101_U2_C18POS_30 -8.210 0.070 -0.563
## 6101_U3_C18POS_29_1 -6.971 0.091 -0.478
## 6101_U4_C18POS_14 -6.798 0.077 -0.466
## 6102_U1_C18POS_26_1 -6.285 0.083 -0.431
## 6102_U2_C18POS_16 -6.343 0.073 -0.435
## 6102_U3_C18POS_48 -3.185 0.009 -0.219
## 6102_U4_C18POS_50 -4.348 0.035 -0.298
## 6103_U1_C18POS_21 -1.485 0.003 -0.102
## 6103_U2_C18POS_60 -6.156 0.039 -0.422
## Dim.3 cos2
## 6101_U1_C18POS_59 | -9.728 0.155
## 6101_U2_C18POS_30 | 8.542 0.076
## 6101_U3_C18POS_29_1 | -7.504 0.105
## 6101_U4_C18POS_14 | -8.527 0.120
## 6102_U1_C18POS_26_1 | -8.551 0.154
## 6102_U2_C18POS_16 | 11.340 0.235
## 6102_U3_C18POS_48 | -8.579 0.064
## 6102_U4_C18POS_50 | -7.807 0.114
## 6103_U1_C18POS_21 | -0.811 0.001
## 6103_U2_C18POS_60 | 14.503 0.214
## v.test
## 6101_U1_C18POS_59 -1.051 |
## 6101_U2_C18POS_30 0.923 |
## 6101_U3_C18POS_29_1 -0.810 |
## 6101_U4_C18POS_14 -0.921 |
## 6102_U1_C18POS_26_1 -0.923 |
## 6102_U2_C18POS_16 1.225 |
## 6102_U3_C18POS_48 -0.927 |
## 6102_U4_C18POS_50 -0.843 |
## 6103_U1_C18POS_21 -0.088 |
## 6103_U2_C18POS_60 1.566 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6101_U1_C18POS_59 | | | 24.673 | | | -9.190 | 0.139 | -0.437 | | | -7.502 | 0.092 | -0.515 | | | -9.728 | 0.155 | -1.051 | | |
| 6101_U2_C18POS_30 | | | 31.022 | | | -7.697 | 0.062 | -0.366 | | | -8.210 | 0.070 | -0.563 | | | 8.542 | 0.076 | 0.923 | | |
| 6101_U3_C18POS_29_1 | | | 23.104 | | | -8.563 | 0.137 | -0.407 | | | -6.971 | 0.091 | -0.478 | | | -7.504 | 0.105 | -0.810 | | |
| 6101_U4_C18POS_14 | | | 24.577 | | | -9.044 | 0.135 | -0.430 | | | -6.798 | 0.077 | -0.466 | | | -8.527 | 0.120 | -0.921 | | |
| 6102_U1_C18POS_26_1 | | | 21.767 | | | -8.370 | 0.148 | -0.398 | | | -6.285 | 0.083 | -0.431 | | | -8.551 | 0.154 | -0.923 | | |
| 6102_U2_C18POS_16 | | | 23.410 | | | -7.332 | 0.098 | -0.349 | | | -6.343 | 0.073 | -0.435 | | | 11.340 | 0.235 | 1.225 | | |
| 6102_U3_C18POS_48 | | | 33.928 | | | -7.323 | 0.047 | -0.348 | | | -3.185 | 0.009 | -0.219 | | | -8.579 | 0.064 | -0.927 | | |
| 6102_U4_C18POS_50 | | | 23.082 | | | -7.190 | 0.097 | -0.342 | | | -4.348 | 0.035 | -0.298 | | | -7.807 | 0.114 | -0.843 | | |
| 6103_U1_C18POS_21 | | | 29.267 | | | -2.553 | 0.008 | -0.121 | | | -1.485 | 0.003 | -0.102 | | | -0.811 | 0.001 | -0.088 | | |
| 6103_U2_C18POS_60 | | | 31.321 | | | -7.237 | 0.053 | -0.344 | | | -6.156 | 0.039 | -0.422 | | | 14.503 | 0.214 | 1.566 | | |
# pull PC coordinates into df
PC_coord_noQCs_log2 <- as.data.frame(PCA.DC_imp_metabind_clust_log2_noQCs$ind$coord)
# bind back metadata from cols 1-10
PC_coord_noQCs_log2 <- bind_cols(anno_imp_metabind_clust_log2_noQCs[,1:11], PC_coord_noQCs_log2)
# grab some variance explained
importance_noQC <- PCA.DC_imp_metabind_clust_log2_noQCs$eig
# set variance explained with PC1, round to 2 digits
PC1_noQC <- round(importance_noQC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_noQC <- round(importance_noQC[2,2], 2)Using FactoExtra
# plot of contributions from features to PC1
(var_contrib_noQCs_PC1 <- fviz_contrib(PCA.DC_imp_metabind_clust_log2_noQCs,
choice = "var",
axes = 1,
top = 25,
title = "Var contribution to PC1: no QCs"))# plot of contributions from features to PC2
(var_contrib_noQCs_PC2 <- fviz_contrib(PCA.DC_imp_metabind_clust_log2_noQCs,
choice = "var",
axes = 2,
top = 25,
title = "Var contribution to PC2: no QCs"))(PCA_withoutQCs <- PC_coord_noQCs_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = Intervention)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gold", "tomato1")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_noQC/PC1_noQC) +
labs(x = glue::glue("PC1: {PC1_noQC}%"),
y = glue::glue("PC2: {PC2_noQC}%"),
fill = "Intervention",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, without QCs"))(PCA_withoutQCs.pre_post <- PC_coord_noQCs_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_noQC/PC1_noQC) +
labs(x = glue::glue("PC1: {PC1_noQC}%"),
y = glue::glue("PC2: {PC2_noQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed, without QCs"))From looking at the PCA plots, subject 6106 and 6112 are visual outliers. Let’s remove them.
# go back to wide data
DC_imp_nooutliers_log2 <- anno_imp_metabind_clust_log2 %>%
filter(Subject != 6106,
Subject != 6112)
PCA.DC_imp_nooutliers_log2 <- PCA(DC_imp_nooutliers_log2, # wide data
quali.sup = 1:11, # remove qualitative variables
graph = FALSE, # don't graph
scale.unit = FALSE) # don't scale, already transformed data
# PCA summary
summary(PCA.DC_imp_nooutliers_log2)##
## Call:
## PCA(X = DC_imp_nooutliers_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 628.012 80.003 76.073 56.124 49.726 40.573 34.933
## % of var. 50.664 6.454 6.137 4.528 4.012 3.273 2.818
## Cumulative % of var. 50.664 57.119 63.256 67.783 71.795 75.068 77.886
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 27.643 26.037 24.397 18.719 17.907 12.987 12.775
## % of var. 2.230 2.101 1.968 1.510 1.445 1.048 1.031
## Cumulative % of var. 80.117 82.217 84.185 85.695 87.140 88.188 89.218
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 10.779 10.439 8.938 8.527 7.977 7.597 7.114
## % of var. 0.870 0.842 0.721 0.688 0.643 0.613 0.574
## Cumulative % of var. 90.088 90.930 91.651 92.339 92.982 93.595 94.169
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 6.444 5.977 5.503 5.196 5.152 4.836 4.582
## % of var. 0.520 0.482 0.444 0.419 0.416 0.390 0.370
## Cumulative % of var. 94.689 95.171 95.615 96.035 96.450 96.840 97.210
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 4.162 3.588 3.311 3.170 2.926 2.889 2.644
## % of var. 0.336 0.289 0.267 0.256 0.236 0.233 0.213
## Cumulative % of var. 97.546 97.835 98.102 98.358 98.594 98.827 99.040
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 2.452 2.321 2.064 2.012 1.763 0.395 0.230
## % of var. 0.198 0.187 0.167 0.162 0.142 0.032 0.019
## Cumulative % of var. 99.238 99.425 99.592 99.754 99.896 99.928 99.947
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47
## Variance 0.210 0.187 0.174 0.088 0.000
## % of var. 0.017 0.015 0.014 0.007 0.000
## Cumulative % of var. 99.964 99.979 99.993 100.000 100.000
##
## Individuals (the 10 first)
## Dist Dim.1
## 1 | 27.391 | -17.588
## 2 | 25.234 | -17.617
## 3 | 29.723 | -6.298
## 4 | 26.846 | -16.311
## 5 | 27.493 | -14.359
## 6 | 30.688 | -14.378
## 7 | 25.533 | -11.501
## 8 | 23.059 | -9.879
## 9 | 30.043 | -10.063
## 10 | 25.813 | -9.788
## ctr cos2
## 1 1.026 0.412 |
## 2 1.030 0.487 |
## 3 0.132 0.045 |
## 4 0.883 0.369 |
## 5 0.684 0.273 |
## 6 0.686 0.219 |
## 7 0.439 0.203 |
## 8 0.324 0.184 |
## 9 0.336 0.112 |
## 10 0.318 0.144 |
## Dim.2 ctr
## 1 4.065 0.430
## 2 2.005 0.105
## 3 -9.076 2.145
## 4 -0.254 0.002
## 5 -0.246 0.002
## 6 11.126 3.224
## 7 -2.691 0.189
## 8 -4.698 0.575
## 9 -0.348 0.003
## 10 -1.289 0.043
## cos2 Dim.3
## 1 0.022 | 6.867
## 2 0.006 | 6.771
## 3 0.093 | 6.240
## 4 0.000 | 7.255
## 5 0.000 | 4.425
## 6 0.131 | 4.961
## 7 0.011 | 5.715
## 8 0.042 | -4.735
## 9 0.000 | 9.683
## 10 0.002 | 4.413
## ctr cos2
## 1 1.291 0.063 |
## 2 1.255 0.072 |
## 3 1.066 0.044 |
## 4 1.441 0.073 |
## 5 0.536 0.026 |
## 6 0.674 0.026 |
## 7 0.894 0.050 |
## 8 0.614 0.042 |
## 9 2.567 0.104 |
## 10 0.533 0.029 |
##
## Variables (the 10 first)
## Dim.1 ctr
## 226.9516_0.553 | 0.018 0.000
## 159.1492_0.608 | -0.630 0.063
## 175.1442_0.616 | 0.098 0.002
## 189.1684_0.616 | -0.010 0.000
## 189.16_0.615 | -0.012 0.000
## 156.0769_0.621 | 0.016 0.000
## 170.0926_0.62 | 0.016 0.000
## 136.0482_0.633 | 0.016 0.000
## 137.071_0.636 | 0.071 0.001
## 162.1126_0.642 | 0.160 0.004
## cos2 Dim.2
## 226.9516_0.553 0.001 | 0.002
## 159.1492_0.608 0.468 | 0.219
## 175.1442_0.616 0.016 | 0.271
## 189.1684_0.616 0.000 | 0.157
## 189.16_0.615 0.001 | 0.015
## 156.0769_0.621 0.000 | 0.030
## 170.0926_0.62 0.000 | 0.291
## 136.0482_0.633 0.001 | -0.017
## 137.071_0.636 0.019 | 0.078
## 162.1126_0.642 0.029 | 0.196
## ctr cos2
## 226.9516_0.553 0.000 0.000 |
## 159.1492_0.608 0.060 0.057 |
## 175.1442_0.616 0.092 0.125 |
## 189.1684_0.616 0.031 0.062 |
## 189.16_0.615 0.000 0.001 |
## 156.0769_0.621 0.001 0.001 |
## 170.0926_0.62 0.106 0.112 |
## 136.0482_0.633 0.000 0.001 |
## 137.071_0.636 0.008 0.023 |
## 162.1126_0.642 0.048 0.044 |
## Dim.3 ctr cos2
## 226.9516_0.553 0.023 0.001 0.002
## 159.1492_0.608 -0.008 0.000 0.000
## 175.1442_0.616 -0.103 0.014 0.018
## 189.1684_0.616 -0.194 0.050 0.095
## 189.16_0.615 0.009 0.000 0.000
## 156.0769_0.621 0.067 0.006 0.006
## 170.0926_0.62 -0.194 0.050 0.050
## 136.0482_0.633 0.057 0.004 0.007
## 137.071_0.636 -0.084 0.009 0.027
## 162.1126_0.642 0.085 0.009 0.008
##
## 226.9516_0.553 |
## 159.1492_0.608 |
## 175.1442_0.616 |
## 189.1684_0.616 |
## 189.16_0.615 |
## 156.0769_0.621 |
## 170.0926_0.62 |
## 136.0482_0.633 |
## 137.071_0.636 |
## 162.1126_0.642 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1
## sample_ID_6101_U1_C18POS_59 | 27.391 | -17.588
## sample_ID_6101_U2_C18POS_30 | 31.486 | -11.643
## sample_ID_6101_U3_C18POS_29_1 | 25.769 | -16.428
## sample_ID_6101_U4_C18POS_14 | 26.832 | -16.086
## sample_ID_6102_U1_C18POS_26_1 | 25.234 | -17.617
## sample_ID_6102_U2_C18POS_16 | 24.321 | -11.377
## sample_ID_6102_U3_C18POS_48 | 33.883 | -8.330
## sample_ID_6102_U4_C18POS_50 | 24.519 | -12.113
## sample_ID_6103_U1_C18POS_21 | 29.723 | -6.298
## sample_ID_6103_U2_C18POS_60 | 31.457 | -9.662
## cos2 v.test
## sample_ID_6101_U1_C18POS_59 0.412 -0.702 |
## sample_ID_6101_U2_C18POS_30 0.137 -0.465 |
## sample_ID_6101_U3_C18POS_29_1 0.406 -0.656 |
## sample_ID_6101_U4_C18POS_14 0.359 -0.642 |
## sample_ID_6102_U1_C18POS_26_1 0.487 -0.703 |
## sample_ID_6102_U2_C18POS_16 0.219 -0.454 |
## sample_ID_6102_U3_C18POS_48 0.060 -0.332 |
## sample_ID_6102_U4_C18POS_50 0.244 -0.483 |
## sample_ID_6103_U1_C18POS_21 0.045 -0.251 |
## sample_ID_6103_U2_C18POS_60 0.094 -0.386 |
## Dim.2 cos2
## sample_ID_6101_U1_C18POS_59 4.065 0.022
## sample_ID_6101_U2_C18POS_30 -0.871 0.001
## sample_ID_6101_U3_C18POS_29_1 2.637 0.010
## sample_ID_6101_U4_C18POS_14 0.501 0.000
## sample_ID_6102_U1_C18POS_26_1 2.005 0.006
## sample_ID_6102_U2_C18POS_16 -2.726 0.013
## sample_ID_6102_U3_C18POS_48 2.023 0.004
## sample_ID_6102_U4_C18POS_50 -0.591 0.001
## sample_ID_6103_U1_C18POS_21 -9.076 0.093
## sample_ID_6103_U2_C18POS_60 -12.027 0.146
## v.test Dim.3
## sample_ID_6101_U1_C18POS_59 0.454 | 6.867
## sample_ID_6101_U2_C18POS_30 -0.097 | -9.024
## sample_ID_6101_U3_C18POS_29_1 0.295 | 5.131
## sample_ID_6101_U4_C18POS_14 0.056 | 7.761
## sample_ID_6102_U1_C18POS_26_1 0.224 | 6.771
## sample_ID_6102_U2_C18POS_16 -0.305 | -10.800
## sample_ID_6102_U3_C18POS_48 0.226 | 8.833
## sample_ID_6102_U4_C18POS_50 -0.066 | 8.896
## sample_ID_6103_U1_C18POS_21 -1.015 | 6.240
## sample_ID_6103_U2_C18POS_60 -1.345 | -9.620
## cos2 v.test
## sample_ID_6101_U1_C18POS_59 0.063 0.787 |
## sample_ID_6101_U2_C18POS_30 0.082 -1.035 |
## sample_ID_6101_U3_C18POS_29_1 0.040 0.588 |
## sample_ID_6101_U4_C18POS_14 0.084 0.890 |
## sample_ID_6102_U1_C18POS_26_1 0.072 0.776 |
## sample_ID_6102_U2_C18POS_16 0.197 -1.238 |
## sample_ID_6102_U3_C18POS_48 0.068 1.013 |
## sample_ID_6102_U4_C18POS_50 0.132 1.020 |
## sample_ID_6103_U1_C18POS_21 0.044 0.715 |
## sample_ID_6103_U2_C18POS_60 0.094 -1.103 |
# pull PC coordinates into df
PC_nooutliers_QC_log2 <- as.data.frame(PCA.DC_imp_nooutliers_log2$ind$coord)
# bind back metadata from cols 1-11
PC_nooutliers_QC_log2 <- bind_cols(DC_imp_nooutliers_log2[,1:11], PC_nooutliers_QC_log2)
# grab some variance explained
importance_nooutliers_QC <- PCA.DC_imp_nooutliers_log2$eig
# set variance explained with PC1, round to 2 digits
PC1_nooutliers_withQC <- round(importance_nooutliers_QC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_nooutliers_withQC <- round(importance_nooutliers_QC[2,2], 2)Using FactoExtra package
##### Red vs yellow
# manual scores plot
(PCA_nooutliers_withQCs <- PC_nooutliers_QC_log2 %>%
ggplot(aes(x = Dim.1, y = Dim.2,
fill = factor(Intervention, levels = c("Yellow", "Red", "QC")))) +
geom_point(shape = 21, alpha = 0.8) +
scale_fill_manual(values = c("gold", "tomato1", "light grey")) +
scale_color_manual(values = "black") +
theme_minimal() +
labs(x = glue::glue("PC1: {PC1_nooutliers_withQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_withQC}%"),
fill = "Group",
title = "Principal Components Analysis Scores Plot"))(PCA_nooutliers_prepost_withQCs <- PC_nooutliers_QC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
labs(x = glue::glue("PC1: {PC1_nooutliers_withQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_withQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot"))imp_nooutliers_noQCs_log2 <- DC_imp_nooutliers_log2 %>%
filter(Intervention != "QC")
PCA.imp_nooutliers_noQCs_log2 <- PCA(imp_nooutliers_noQCs_log2, # wide data
quali.sup=1:11, # remove qualitative variables
graph=FALSE, # don't graph
scale.unit=FALSE) # don't scale, we already did this
# look at summary
kable(summary(PCA.imp_nooutliers_noQCs_log2))##
## Call:
## PCA(X = imp_nooutliers_noQCs_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 96.261 93.644 67.815 61.286 51.306 44.034 33.471
## % of var. 12.866 12.516 9.064 8.191 6.857 5.885 4.474
## Cumulative % of var. 12.866 25.382 34.446 42.637 49.495 55.380 59.854
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 31.911 29.752 22.786 21.924 17.835 15.445 13.502
## % of var. 4.265 3.977 3.046 2.930 2.384 2.064 1.805
## Cumulative % of var. 64.119 68.096 71.141 74.071 76.455 78.520 80.324
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 12.614 10.861 10.236 9.571 9.324 8.759 7.869
## % of var. 1.686 1.452 1.368 1.279 1.246 1.171 1.052
## Cumulative % of var. 82.010 83.462 84.830 86.109 87.355 88.526 89.578
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 7.213 6.699 6.235 6.198 5.803 5.513 5.094
## % of var. 0.964 0.895 0.833 0.828 0.776 0.737 0.681
## Cumulative % of var. 90.542 91.437 92.271 93.099 93.875 94.612 95.293
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 4.361 3.973 3.855 3.538 3.508 3.182 2.942
## % of var. 0.583 0.531 0.515 0.473 0.469 0.425 0.393
## Cumulative % of var. 95.875 96.406 96.922 97.395 97.863 98.289 98.682
## Dim.36 Dim.37 Dim.38 Dim.39
## Variance 2.810 2.480 2.416 2.156
## % of var. 0.376 0.331 0.323 0.288
## Cumulative % of var. 99.058 99.389 99.712 100.000
##
## Individuals (the 10 first)
## Dist Dim.1
## 1 | 22.177 | 1.491
## 2 | 19.368 | -0.355
## 3 | 29.393 | -9.461
## 4 | 22.105 | -2.507
## 5 | 23.759 | -1.703
## 6 | 27.369 | 9.306
## 7 | 22.789 | -3.784
## 8 | 20.866 | -3.634
## 9 | 28.364 | -2.372
## 10 | 23.978 | -2.290
## ctr cos2
## 1 0.058 0.005 |
## 2 0.003 0.000 |
## 3 2.325 0.104 |
## 4 0.163 0.013 |
## 5 0.075 0.005 |
## 6 2.249 0.116 |
## 7 0.372 0.028 |
## 8 0.343 0.030 |
## 9 0.146 0.007 |
## 10 0.136 0.009 |
## Dim.2 ctr cos2
## 1 -9.589 2.455 0.187
## 2 -8.720 2.030 0.203
## 3 -2.323 0.144 0.006
## 4 -8.338 1.856 0.142
## 5 -5.259 0.738 0.049
## 6 -8.210 1.799 0.090
## 7 -4.671 0.583 0.042
## 8 5.419 0.784 0.067
## 9 -9.050 2.187 0.102
## 10 -4.105 0.450 0.029
## Dim.3 ctr
## 1 | -4.758 0.835
## 2 | -4.139 0.632
## 3 | 1.185 0.052
## 4 | -2.817 0.293
## 5 | -1.906 0.134
## 6 | 2.124 0.166
## 7 | 2.880 0.306
## 8 | -2.610 0.251
## 9 | 4.774 0.840
## 10 | -4.969 0.910
## cos2
## 1 0.046 |
## 2 0.046 |
## 3 0.002 |
## 4 0.016 |
## 5 0.006 |
## 6 0.006 |
## 7 0.016 |
## 8 0.016 |
## 9 0.028 |
## 10 0.043 |
##
## Variables (the 10 first)
## Dim.1 ctr
## 226.9516_0.553 | -0.013 0.000
## 159.1492_0.608 | 0.218 0.049
## 175.1442_0.616 | 0.318 0.105
## 189.1684_0.616 | 0.210 0.046
## 189.16_0.615 | 0.001 0.000
## 156.0769_0.621 | -0.004 0.000
## 170.0926_0.62 | 0.335 0.117
## 136.0482_0.633 | -0.035 0.001
## 137.071_0.636 | 0.101 0.011
## 162.1126_0.642 | 0.193 0.039
## cos2 Dim.2
## 226.9516_0.553 0.001 | -0.048
## 159.1492_0.608 0.091 | -0.081
## 175.1442_0.616 0.146 | 0.050
## 189.1684_0.616 0.092 | 0.157
## 189.16_0.615 0.000 | -0.043
## 156.0769_0.621 0.000 | -0.123
## 170.0926_0.62 0.125 | 0.084
## 136.0482_0.633 0.002 | -0.062
## 137.071_0.636 0.033 | 0.064
## 162.1126_0.642 0.037 | -0.129
## ctr cos2
## 226.9516_0.553 0.002 0.008 |
## 159.1492_0.608 0.007 0.013 |
## 175.1442_0.616 0.003 0.004 |
## 189.1684_0.616 0.026 0.052 |
## 189.16_0.615 0.002 0.005 |
## 156.0769_0.621 0.016 0.017 |
## 170.0926_0.62 0.008 0.008 |
## 136.0482_0.633 0.004 0.007 |
## 137.071_0.636 0.004 0.013 |
## 162.1126_0.642 0.018 0.016 |
## Dim.3 ctr cos2
## 226.9516_0.553 -0.312 0.143 0.341
## 159.1492_0.608 0.222 0.073 0.094
## 175.1442_0.616 0.163 0.039 0.038
## 189.1684_0.616 0.088 0.012 0.016
## 189.16_0.615 -0.069 0.007 0.014
## 156.0769_0.621 0.063 0.006 0.004
## 170.0926_0.62 -0.207 0.063 0.048
## 136.0482_0.633 -0.339 0.169 0.197
## 137.071_0.636 -0.102 0.015 0.034
## 162.1126_0.642 0.124 0.023 0.015
##
## 226.9516_0.553 |
## 159.1492_0.608 |
## 175.1442_0.616 |
## 189.1684_0.616 |
## 189.16_0.615 |
## 156.0769_0.621 |
## 170.0926_0.62 |
## 136.0482_0.633 |
## 137.071_0.636 |
## 162.1126_0.642 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1
## 6101_U1_C18POS_59 | 22.177 | 1.491
## 6101_U2_C18POS_30 | 29.285 | 0.688
## 6101_U3_C18POS_29_1 | 20.734 | 0.603
## 6101_U4_C18POS_14 | 22.205 | -1.904
## 6102_U1_C18POS_26_1 | 19.368 | -0.355
## 6102_U2_C18POS_16 | 21.448 | -0.518
## 6102_U3_C18POS_48 | 32.995 | 0.188
## 6102_U4_C18POS_50 | 21.415 | -2.611
## 6103_U1_C18POS_21 | 29.393 | -9.461
## 6103_U2_C18POS_60 | 29.874 | -9.327
## cos2 v.test
## 6101_U1_C18POS_59 0.005 0.152 |
## 6101_U2_C18POS_30 0.001 0.070 |
## 6101_U3_C18POS_29_1 0.001 0.061 |
## 6101_U4_C18POS_14 0.007 -0.194 |
## 6102_U1_C18POS_26_1 0.000 -0.036 |
## 6102_U2_C18POS_16 0.001 -0.053 |
## 6102_U3_C18POS_48 0.000 0.019 |
## 6102_U4_C18POS_50 0.015 -0.266 |
## 6103_U1_C18POS_21 0.104 -0.964 |
## 6103_U2_C18POS_60 0.097 -0.951 |
## Dim.2 cos2 v.test
## 6101_U1_C18POS_59 -9.589 0.187 -0.991
## 6101_U2_C18POS_30 7.942 0.074 0.821
## 6101_U3_C18POS_29_1 -7.336 0.125 -0.758
## 6101_U4_C18POS_14 -9.051 0.166 -0.935
## 6102_U1_C18POS_26_1 -8.720 0.203 -0.901
## 6102_U2_C18POS_16 10.544 0.242 1.090
## 6102_U3_C18POS_48 -8.643 0.069 -0.893
## 6102_U4_C18POS_50 -8.582 0.161 -0.887
## 6103_U1_C18POS_21 -2.323 0.006 -0.240
## 6103_U2_C18POS_60 12.634 0.179 1.306
## Dim.3 cos2
## 6101_U1_C18POS_59 | -4.758 0.046
## 6101_U2_C18POS_30 | 4.097 0.020
## 6101_U3_C18POS_29_1 | -6.740 0.106
## 6101_U4_C18POS_14 | -1.348 0.004
## 6102_U1_C18POS_26_1 | -4.139 0.046
## 6102_U2_C18POS_16 | 7.046 0.108
## 6102_U3_C18POS_48 | 2.545 0.006
## 6102_U4_C18POS_50 | 7.651 0.128
## 6103_U1_C18POS_21 | 1.185 0.002
## 6103_U2_C18POS_60 | 7.989 0.072
## v.test
## 6101_U1_C18POS_59 -0.578 |
## 6101_U2_C18POS_30 0.497 |
## 6101_U3_C18POS_29_1 -0.818 |
## 6101_U4_C18POS_14 -0.164 |
## 6102_U1_C18POS_26_1 -0.503 |
## 6102_U2_C18POS_16 0.856 |
## 6102_U3_C18POS_48 0.309 |
## 6102_U4_C18POS_50 0.929 |
## 6103_U1_C18POS_21 0.144 |
## 6103_U2_C18POS_60 0.970 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6101_U1_C18POS_59 | | | 22.177 | | | 1.491 | 0.005 | 0.152 | | | -9.589 | 0.187 | -0.991 | | | -4.758 | 0.046 | -0.578 | | |
| 6101_U2_C18POS_30 | | | 29.285 | | | 0.688 | 0.001 | 0.070 | | | 7.942 | 0.074 | 0.821 | | | 4.097 | 0.020 | 0.497 | | |
| 6101_U3_C18POS_29_1 | | | 20.734 | | | 0.603 | 0.001 | 0.061 | | | -7.336 | 0.125 | -0.758 | | | -6.740 | 0.106 | -0.818 | | |
| 6101_U4_C18POS_14 | | | 22.205 | | | -1.904 | 0.007 | -0.194 | | | -9.051 | 0.166 | -0.935 | | | -1.348 | 0.004 | -0.164 | | |
| 6102_U1_C18POS_26_1 | | | 19.368 | | | -0.355 | 0.000 | -0.036 | | | -8.720 | 0.203 | -0.901 | | | -4.139 | 0.046 | -0.503 | | |
| 6102_U2_C18POS_16 | | | 21.448 | | | -0.518 | 0.001 | -0.053 | | | 10.544 | 0.242 | 1.090 | | | 7.046 | 0.108 | 0.856 | | |
| 6102_U3_C18POS_48 | | | 32.995 | | | 0.188 | 0.000 | 0.019 | | | -8.643 | 0.069 | -0.893 | | | 2.545 | 0.006 | 0.309 | | |
| 6102_U4_C18POS_50 | | | 21.415 | | | -2.611 | 0.015 | -0.266 | | | -8.582 | 0.161 | -0.887 | | | 7.651 | 0.128 | 0.929 | | |
| 6103_U1_C18POS_21 | | | 29.393 | | | -9.461 | 0.104 | -0.964 | | | -2.323 | 0.006 | -0.240 | | | 1.185 | 0.002 | 0.144 | | |
| 6103_U2_C18POS_60 | | | 29.874 | | | -9.327 | 0.097 | -0.951 | | | 12.634 | 0.179 | 1.306 | | | 7.989 | 0.072 | 0.970 | | |
# pull PC coordinates into df
PC_coord_nooutliers_noQC_log2 <- as.data.frame(PCA.imp_nooutliers_noQCs_log2$ind$coord)
# bind back metadata from cols 1-11
PC_coord_nooutliers_noQC_log2 <- bind_cols(imp_nooutliers_noQCs_log2[,1:11], PC_coord_nooutliers_noQC_log2)
# grab some variance explained
importance_nooutliers_noQC <- PCA.imp_nooutliers_noQCs_log2$eig
# set variance explained with PC1, round to 2 digits
PC1_nooutliers_noQC <- round(importance_nooutliers_noQC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_nooutliers_noQC <- round(importance_nooutliers_noQC[2,2], 2)Using FactoExtra
# plot of contributions from features to PC1
(var_contrib_nooutliers_noQCs_PC1 <- fviz_contrib(PCA.imp_nooutliers_noQCs_log2,
choice = "var",
axes = 1,
top = 20,
title = "Var contribution to PC1: no outliers, no QCs"))# plot of contributions from features to PC2
(var_contrib_nooutliers_noQCs_PC2 <- fviz_contrib(PCA.imp_nooutliers_noQCs_log2,
choice = "var",
axes = 2,
top = 20,
title = "Var contribution to PC2: no outliers, no QCs"))(PCA_nooutliers_withoutQCs <- PC_coord_nooutliers_noQC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = Intervention)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("tomato1", "gold")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "Intervention",
title = "Principal Components Analysis Scores Plot"))(PCA_nooutliers_noQCs.prepost <- PC_coord_nooutliers_noQC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, no outliers"))(PCA_nooutliers_noQCs.MvsF <- PC_coord_nooutliers_noQC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(Sex, levels = c("M","F")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("green", "pink")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, no 6106"))# create rel abund df suitable for PCAtools package
imp_clust_omicsdata_outliers_forPCAtools <- as.data.frame(t(anno_imp_metabind_clust_log2[,-c(2:11)])) # transpose df (using df with key)
names(imp_clust_omicsdata_outliers_forPCAtools) <- imp_clust_omicsdata_outliers_forPCAtools[1,] # make sample IDs column names
imp_clust_omicsdata_outliers_forPCAtools <- imp_clust_omicsdata_outliers_forPCAtools[-1,] # remove sample ID row
# create metadata df suitable for PCAtools pckg
metadata_outliers_forPCAtools <- metadata %>%
column_to_rownames(var = "sample_ID") # change sample ID column to rownames
# create a vector so that col names in abundance df matches metadata df
order_outliers_forPCAtools <- match(rownames(metadata_outliers_forPCAtools), colnames(imp_clust_omicsdata_outliers_forPCAtools))
# reorder col names in abundance df so that it matches metadata
abundata_outliers_reordered_forPCAtools <- imp_clust_omicsdata_outliers_forPCAtools[ ,order_outliers_forPCAtools]
# change abundance df to numeric and change name to for consistency! (using df that is already log2 transformed as of 8/26/24)
log2_abundata_outliers_reordered_forPCAtools <-
abundata_outliers_reordered_forPCAtools %>%
mutate_all(as.numeric)
# unite pre_post column with intervention column to create pre_intervention column
metadata_outliers_forPCAtools <- metadata_outliers_forPCAtools %>%
unite(col = "pre_post_intervention",
c("pre_post","Intervention"),
sep = "_",
remove = FALSE)(PCAtools_outliers <- biplot(p_outliers,
lab = paste0(metadata_outliers_forPCAtools$Subject),
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "lemonchiffon1",
"post_Yellow" = "yellow2",
"pre_Red" = "rosybrown2",
"post_Red" = "darkred"),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot with Loadings",
subtitle = "Log2 transformed data, c18 (+), without QCs but with outliers /n99% Confidence Ellipses",
ellipse = TRUE,
ellipseType = 't', # assumes multivariate
ellipseLevel = 0.99,
ellipseFill = TRUE,
ellipseAlpha = 0.2,
ellipseLineSize = 0.5,
xlim = c(-10, 10),
ylim = c(-10, 10),
showLoadings = FALSE))6106 and 6112 are outside of the 99% confidence interval, we will classify them as true outliers moving forward.
# create rel abund df suitable for PCAtools package
imp_clust_omicsdata_forPCAtools <- as.data.frame(t(anno_imp_metabind_clust_log2[,-c(2:11)])) # transpose df
names(imp_clust_omicsdata_forPCAtools) <- imp_clust_omicsdata_forPCAtools[1,] # make sample IDs column names
imp_clust_omicsdata_forPCAtools <- imp_clust_omicsdata_forPCAtools[-1,] # remove sample ID row
imp_clust_omicsdata_forPCAtools <- imp_clust_omicsdata_forPCAtools %>%
dplyr::select(!contains("QC")) # remove QC observations
# create metadata df suitable for PCAtools pckg
metadata_forPCAtools <- metadata %>%
column_to_rownames(var = "sample_ID") # change sample ID column to rownames
# create a vector so that col names in abundance df matches metadata df
order_forPCAtools <- match(rownames(metadata_forPCAtools), colnames(imp_clust_omicsdata_forPCAtools))
# reorder col names in abundance df so that it matches metadata
abundata_reordered_forPCAtools <- imp_clust_omicsdata_forPCAtools[ ,order_forPCAtools]
# change abundance df to numeric
log2_abundata_reordered_forPCAtools <- abundata_reordered_forPCAtools %>%
mutate_at(1:ncol(.), as.numeric)
# fix rownames to have keys again
rownames(log2_abundata_reordered_forPCAtools) <- rownames(abundata_reordered_forPCAtools)
# remove outlier subj from both df
log2_abundata_forPCAtools <- log2_abundata_reordered_forPCAtools %>%
dplyr::select(!contains("6106")) %>%
dplyr::select(!contains("6112"))
metadata_forPCAtools <- metadata_forPCAtools %>%
filter(Subject != 6106,
Subject != 6112)
# unite pre_post column with intervention column to create pre_intervention column
metadata_forPCAtools <- metadata_forPCAtools %>%
unite(col = "pre_post_intervention",
c("pre_post","Intervention"),
sep = "_",
remove = FALSE)Horn’s parallel analysis
## [1] 9
# pca
p <- PCAtools::pca(log2_abundata_forPCAtools,
metadata = metadata_forPCAtools,
scale = FALSE # using scaled data already (log2 transformed)
)Elbow method
## PC5
## 5
screeplot(p,
components = getComponents(p, 1:20),
vline = c(horn$n, elbow)) +
geom_label(aes(x = horn$n + 1, y = 50,
label = 'Horn\'s', vjust = -1, size = 8)) +
geom_label(aes(x = elbow + 1, y = 50,
label = 'Elbow method', vjust = -3, size = 8))How many PCs do we need to capture at least 80% variance?
## PC14
## 14
biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "lemonchiffon1",
"post_Yellow" = "yellow2",
"pre_Red" = "rosybrown2",
"post_Red" = "darkred"),
colLegendTitle = "Intervention Timepoint",
# ellipse config
ellipse = TRUE,
ellipseType = 't',
ellipseLevel = 0.99,
ellipseFill = TRUE,
ellipseAlpha = 0.2,
ellipseLineSize = 1.0,
xlim = c(-10,10), ylim = c(-10, 10),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot",
subtitle = "Log2 transformed data, c18 (+), outliers 6106 and 6112 removed, no QCs \n99% confidence level ellipses")(PCA.colby.prevspost <- biplot(p,
lab = NULL,
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "lemonchiffon1",
"post_Yellow" = "yellow2",
"pre_Red" = "rosybrown2",
"post_Red" = "darkred"),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot",
subtitle = "Log2 transformed data, c18 (+), outliers 6106 and 6112 removed, no QCs",
showLoadings = F) )biplot(p,
lab = NULL,
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "lemonchiffon1",
"post_Yellow" = "yellow2",
"pre_Red" = "rosybrown2",
"post_Red" = "darkred"),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot",
subtitle = "Log2 transformed data, c18 (+), outliers 6106 and 6112 removed, no QCs",
showLoadings = T)(PCA_pairsplot.colby.prevspost <-
pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "lemonchiffon1",
"post_Yellow" = "yellow2",
"pre_Red" = "rosybrown2",
"post_Red" = "darkred"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm')))(PCA.colby.Sex <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'Sex',
colkey = c("M" = "red",
"F" = "purple"),
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject)))) pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'Sex',
colkey = c("M" = "red",
"F" = "purple"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))(PCA.colby.overall.prevspost <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'pre_post',
colkey = c("pre" = "gray",
"post" = "darkgreen"),
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject)))) pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'pre_post',
colkey = c("pre" = "gray",
"post" = "darkgreen"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))Here is a good check for any period effects
(PCA.colby.period <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'Period',
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject)))) pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'Period',
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))Also looking for sequence effects
(PCA.colby.sequence <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'sequence',
hline = 0, vline = 0,
legendPosition = 'right' +
geom_point(aes(text = metadata_forPCAtools$Subject)))) pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'sequence',
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))This is a cool way to explore the correlations between the metadata and the PCs! I want to look at how the metavariables correlate with PCs that account for 80% variation in the dataset.
Again: How many PCs do we need to capture at least 80% variance?
## PC14
## 14
eigencorplot(p,
components = getComponents(p, 1:16), # get components that account for 80% variance
metavars = colnames(metadata_forPCAtools),
col = c('darkblue', 'blue2', 'gray', 'red2', 'darkred'),
cexCorval = 0.7,
colCorval = 'white',
fontCorval = 2,
posLab = 'bottomleft',
rotLabX = 45,
posColKey = 'top',
cexLabColKey = 1.5,
scale = TRUE,
main = 'PC1-14 metadata correlations',
colFrame = 'white',
plotRsquared = FALSE) eigencorplot(p,
components = getComponents(p, 1:16),
metavars = colnames(metadata_forPCAtools),
col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'),
cexCorval = 1.2,
fontCorval = 2,
posLab = 'all',
rotLabX = 45,
scale = TRUE,
main = bquote(Principal ~ component ~ Pearson ~ r^2 ~ metadata ~ correlates),
plotRsquared = TRUE,
corFUN = 'pearson',
corUSE = 'pairwise.complete.obs',
corMultipleTestCorrection = 'BH',
signifSymbols = c('****', '***', '**', '*', ''),
signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))I am most interested in PCs affected by pre_post_intervention, so I think it would be good to investigate the metabolites that contribute the most to these PCs.
# loadings plot for PCs 7 and 8
plotloadings(p,
components = getComponents(p, c(7,8)),
rangeRetain = 0.1, absolute = TRUE,
col = c('black', 'pink', 'red4'),
drawConnectors = TRUE, labSize = 3,
title = "Loadings plot",
subtitle = "PC 7 and PC 8",
caption = "Pre_post_intervention is strongly correlated with these PCs without multiple testing correction")biplot(p,
lab = NULL,
x = "PC7",
y = "PC8",
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "lemonchiffon1",
"post_Yellow" = "yellow2",
"pre_Red" = "rosybrown2",
"post_Red" = "darkred"),
colLegendTitle = "Intervention Timepoint",
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot",
subtitle = "Log2 transformed data, c18 (+), outliers removed, no QCs \n95% confidence level ellipses",
showLoadings = TRUE)Data_forMPCA <- anno_imp_metabind_clust_log2_noQCs %>%
mutate_at("Subject", as.factor)
summary(as.factor(Data_forMPCA$Subject))## 6101 6102 6103 6104 6105 6106 6108 6109 6110 6111 6112 6113
## 4 4 4 4 4 4 4 4 4 4 4 4
## [1] "sample_ID" "subj_period" "Subject"
## [4] "Period" "pre_post_intervention" "Intervention"
## [7] "pre_post" "sequence" "Intervention_week"
## [10] "Sex" "Age"
Remove post-red for 6112
Data_forMPCA_no6112postRed <- Data_forMPCA %>%
filter(!((Subject == 6112) & pre_post_intervention == "post_Red"))mixOmicsPCA.result <- mixOmics::pca(Data_forMPCA_no6112postRed[,!names(Data_forMPCA_no6112postRed) %in% metavar],
scale = FALSE,
center = T)
plotIndiv(mixOmicsPCA.result,
ind.names = Data_forMPCA_no6112postRed$Subject,
group = Data_forMPCA_no6112postRed$pre_post_intervention,
col.per.group = c("post_Red" = "darkred",
"post_Yellow" = "yellow3",
"pre_Red" = "rosybrown",
"post_Yellow" = "lemonchiffon2"),
legend.title = "Intervention Timepoint",
legend = TRUE,
title = 'Regular PCA, c18 (+), Log2 transformed')multilevelPCA.result <- mixOmics::pca(Data_forMPCA_no6112postRed[,-(c(1:11))],
multilevel = Data_forMPCA_no6112postRed$Subject,
scale = FALSE,
center = TRUE)plotIndiv(multilevelPCA.result,
ind.names = Data_forMPCA_no6112postRed$Subject,
group = Data_forMPCA_no6112postRed$pre_post_intervention,
col.per.group = c("post_Red" = "darkred",
"post_Yellow" = "yellow3",
"pre_Red" = "rosybrown",
"post_Yellow" = "lemonchiffon2"),
legend = TRUE,
legend.title = "Intervention Timepoint",
title = 'Multilevel PCA, c18 (+), Log2 transformed')(loadings_multilevelPC1 <- plotLoadings(multilevelPCA.result, comp = 1,
ndisplay = 15,
title = "Top 15 features on Multilevel PC1, c18 (+)"))## importance
## 1 0.34486490
## 2 0.30829322
## 3 0.30706419
## 4 0.27885928
## 5 0.27522294
## 6 0.27467644
## 7 0.25562287
## 8 0.23114642
## 9 0.22320071
## 10 0.21509897
## 11 0.20597482
## 12 0.20570818
## 13 0.10473433
## 14 0.09582281
## 15 -0.08974764
(loadings_multilevelPC2 <- plotLoadings(multilevelPCA.result, comp = 2,
ndisplay = 10,
title = "Top 10 features on Multilevel PC2, c18 (+)"))## importance
## 1 -0.2347686
## 2 -0.1981823
## 3 -0.1605787
## 4 -0.1566305
## 5 -0.1497762
## 6 -0.1483517
## 7 -0.1441631
## 8 -0.1381850
## 9 -0.1354448
## 10 -0.1337519
multilevelPCA_scores <- multilevelPCA.result$variates$X %>% # retrieve scores
as.data.frame() %>%
mutate(sample_ID = Data_forMPCA_no6112postRed$sample_ID)
# join with metadata
multilevelPCA_points <- left_join(multilevelPCA_scores, Data_forMPCA_no6112postRed[,1:11]) # visualize!
(plot_multilevelPCA_manual <- multilevelPCA_points %>%
ggplot(aes(x = PC1, y = PC2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("lemonchiffon1",
"yellow2",
"rosybrown2",
"darkred")) +
scale_color_manual(values = "black") +
theme_minimal() +
labs(x = "PC1, 12% variation",
y = "PC2, 9% variation",
fill = "Timepoint",
title = "Multilevel Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, c18 (+) without outlier subjects"))# retrieve loadings
multilevelPCA_loadings <- multilevelPCA.result$loadings$X %>%
as.data.frame() %>%
rownames_to_column("mz_rt")multilevelPCA_loadings %>%
filter(PC1 %in% loadings_multilevelPC1$importance
| PC2 %in% loadings_multilevelPC2$importance) %>%
ggplot(aes(x = PC1, y = PC2, label = mz_rt)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed") +
geom_point() +
geom_label_repel(size = 2.5) +
scale_fill_brewer() +
theme_minimal() +
labs( x = "PC1, 12% variation",
y = "PC2, 9% variation",
title = "Loadings Plot for Multilevel PCA",
subtitle = "c18 (+) \nSubj 6112 at post Tomato-Soy timepoint removed",)# fx to normalize scores so that scores and loadings are on the same scale
normalize <- function(x) return((x - min(x))/(max(x) - min(x)))# normalize scores
multilevelPCA_scores_normalized <- multilevelPCA_scores %>%
mutate(PC1_norm = scale(normalize(PC1), center = TRUE, scale = FALSE)) %>%
mutate(PC2_norm = scale(normalize(PC2), center = TRUE, scale = FALSE)) %>%
select(sample_ID, PC1_norm, PC2_norm, everything()) # reorder How did it go? PC1_norm and PC2_norm should all now be between -1 and 1
## sample_ID PC1_norm PC2_norm PC1 PC2
## 1 6101_U1_C18POS_59 -0.2138453 -0.05379039 -5.911001 -1.3153606
## 2 6102_U1_C18POS_26_1 -0.2342625 0.20610802 -6.475361 5.0400525
## 3 6103_U1_C18POS_21 -0.1608187 -0.01839624 -4.445267 -0.4498515
## 4 6104_U1_C18POS_55 -0.1608947 0.06093929 -4.447366 1.4901760
## 5 6105_U1_C18POS_43 -0.1647088 -0.34637962 -4.552794 -8.4701773
## 6 6106_U1_C18POS_23 -0.2701061 -0.41273960 -7.466132 -10.0929078
Now we can plot together the scores and loadings in one plot.
# join with metadata
multilevelPCA_points_norm <- left_join(multilevelPCA_scores_normalized, Data_forMPCA_no6112postRed[,1:11]) multilevelPCA_points_norm %>%
ggplot() +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed") +
geom_point(aes(x = PC1_norm, y = PC2_norm,
fill = factor(pre_post_intervention,
levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red"))),
inherit.aes = F,
shape = 21) +
geom_segment(data = multilevelPCA_loadings %>%
filter(PC1 %in% loadings_multilevelPC1$importance
| PC2 %in% loadings_multilevelPC2$importance),
aes(x = 0, y = 0,
xend = (PC1*4), yend = (PC2*4)),
arrow = arrow(length = unit(1/2, "picas")),
color = "black", alpha=0.55) +
geom_label_repel(data = multilevelPCA_loadings %>%
filter(PC1 %in% loadings_multilevelPC1$importance
| PC2 %in% loadings_multilevelPC2$importance),
aes(x = PC1*4.6, y = PC2*5, label = mz_rt),
size = 2.5, segment.color = "transparent",
direction = "both", max.overlaps = 15) +
scale_fill_manual(values = c("lemonchiffon1",
"yellow2",
"rosybrown2",
"darkred")) +
theme_minimal() +
labs(x = "PC1, 12% variation",
y = "PC2, 9% variation",
title = "Biplot for Multilevel PCA",
subtitle = "c18 (+) \nSubj 6112 at post Tomato-Soy timepoint removed",
fill = "Timepoint",
caption = "")multilevelPCA_points_norm %>%
ggplot() +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_vline(xintercept = 0, linetype = "dashed") +
geom_point(aes(x = PC1_norm, y = PC2_norm,
fill = factor(pre_post_intervention,
levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red"))),
inherit.aes = F,
shape = 21) +
geom_label_repel(data = multilevelPCA_loadings %>%
filter(PC1 %in% loadings_multilevelPC1$importance
| PC2 %in% loadings_multilevelPC2$importance),
aes(x = PC1*4.6, y = PC2*5, label = mz_rt),
size = 2.5, segment.color = "transparent",
direction = "both", max.overlaps = 15) +
scale_fill_manual(values = c("lemonchiffon1",
"yellow2",
"rosybrown2",
"darkred"),
labels = c("pre control",
"post control",
"pre tomato-soy",
"post tomato-soy")) +
theme_minimal() +
labs(x = "PC1, 12% variation",
y = "PC2, 9% variation",
title = "Biplot for Multilevel PCA",
subtitle = "c18 (+) \nSubj 6112 at post Tomato-Soy timepoint removed",
caption = "",
fill = "Timepoint")multilevelPCA.result <- mixOmics::pca(Data_forMPCA[,-(c(1:11))],
multilevel = Data_forMPCA$Subject,
scale = FALSE,
center = FALSE)
plotIndiv(multilevelPCA.result,
ind.names = Data_forMPCA$Subject,
group = Data_forMPCA$pre_post,
legend = TRUE,
legend.title = "Treatment",
title = 'Multilevel PCA, c18 (+), Log2 transformed')use tidy data
## # A tibble: 6 × 18
## sample_ID subj_period Subject Period pre_post_intervention Intervention
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 6101_U1_C18POS_… 6101_U1 6101 U1 pre_Red Red
## 2 6101_U1_C18POS_… 6101_U1 6101 U1 pre_Red Red
## 3 6101_U1_C18POS_… 6101_U1 6101 U1 pre_Red Red
## 4 6101_U1_C18POS_… 6101_U1 6101 U1 pre_Red Red
## 5 6101_U1_C18POS_… 6101_U1 6101 U1 pre_Red Red
## 6 6101_U1_C18POS_… 6101_U1 6101 U1 pre_Red Red
## # ℹ 12 more variables: pre_post <chr>, sequence <chr>, Intervention_week <chr>,
## # Sex <chr>, Age <chr>, BMI <chr>, mz_rt <chr>, rel_abund_log2 <dbl>,
## # annotation <chr>, metabolite_class <chr>, parent_compound <chr>,
## # Feature_ID <chr>
# remove QCs
df_for_stats <- anno_imp_metabind_clust_tidy_log2 %>%
filter(Intervention != "QC")
# check if QCs were removed
unique(df_for_stats$Intervention)## [1] "Red" "Yellow"
# df without outliers
df_for_stats_noOutlier <- df_for_stats %>%
filter(Subject != "6106",
Subject != "6112")
# check if outlier was removed
unique(df_for_stats_noOutlier$Subject)## [1] "6101" "6102" "6103" "6104" "6105" "6108" "6109" "6110" "6111" "6113"
anova_outpout_df <- df_for_stats_noOutlier %>%
dplyr::select(Subject, pre_post_intervention, Feature_ID, rel_abund_log2) %>%
group_by(Feature_ID) %>%
anova_test(rel_abund_log2 ~ pre_post_intervention, wid = Subject,
detailed = TRUE) %>%
adjust_pvalue(method = "BH") %>%
as.data.frame()
anova_sig <- anova_outpout_df %>%
filter(p.adj <= 0.05)
# how many significant features?
nrow(anova_sig)## [1] 20
# tukey's posthoc
tukey_anova <- df_for_stats_noOutlier %>%
dplyr::select(Subject, pre_post_intervention, Feature_ID, rel_abund_log2) %>%
group_by(Feature_ID) %>%
tukey_hsd(rel_abund_log2 ~ pre_post_intervention, wid = subject)df_for_stats_noOutlier %>%
filter(Feature_ID %in% anova_sig$Feature_ID) %>%
ggplot(aes(x = factor(pre_post_intervention,
levels = c("pre_Yellow", "post_Yellow",
"pre_Red", "post_Red")),
y = rel_abund_log2, fill = Intervention)) +
geom_boxplot(outlier.shape = NA) +
scale_fill_manual(values = c("Yellow" = "gold",
"Red" = "tomato1")) +
geom_line(aes(group = Subject), linewidth = 0.2) +
facet_wrap(vars(Feature_ID), scales = "free_y") +
theme_clean()ANOVA_heatmap_data <- anno_imp_metabind_clust_log2_noQCs %>%
unite("Subject_pre_post_intervention", Subject, pre_post_intervention, sep = "_", remove = FALSE) %>%
dplyr::select(sample_ID, Subject, Subject_pre_post_intervention, pre_post_intervention, all_of(anova_sig$Feature_ID)) %>%
column_to_rownames("sample_ID")
ANOVA_heatmap <-
pheatmap(t(ANOVA_heatmap_data[,-c(1:3)]),
scale = "row",
cluster_rows = TRUE,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
labels_col = ANOVA_heatmap_data$Subject_pre_post_intervention,
color = colorRampPalette(c("#67a9cf", "#f7f7f7", "#ef8a62"))(16),
main = "Heatmap of features significant across timepoints \nby repeated measures one-way ANOVA \nBenjamoni-Hochberg corrected p-values > 0.05 \nc18 (+)")ANOVA_heatmap_data2 <- anno_imp_metabind_clust_log2_noQCs %>%
filter(Subject != 6106,
Subject != 6112) %>%
unite("Subject_pre_post_intervention", Subject, pre_post_intervention, sep = "_", remove = FALSE) %>%
dplyr::select(sample_ID, Subject_pre_post_intervention, pre_post_intervention, all_of(anova_sig$Feature_ID)) %>%
column_to_rownames("sample_ID") # i need wide df
df_for_stats_noOutlier_wide <- df_for_stats_noOutlier %>%
select(1:11, Feature_ID, rel_abund_log2) %>%
pivot_wider(names_from = "Feature_ID",
values_from = "rel_abund_log2")
# change pre_post_intervention to factor
df_for_stats_noOutlier_wide$pre_post_intervention <- as.factor(df_for_stats_noOutlier_wide$pre_post_intervention)
# create annotation rows for treatment and wrangle
# select sample col from heatmap metadata (also ensures the order is correct)
anno_trt_row <- as.data.frame(rownames(ANOVA_heatmap_data2))
# pull desired columns
anno_trt_row$pre_post_intervention <- ANOVA_heatmap_data2$pre_post_intervention
# select trt
anno_trt_row <- anno_trt_row %>%
dplyr::select(pre_post_intervention)
# get rownames to match heatmap
rownames(anno_trt_row) <- rownames(ANOVA_heatmap_data2)
# create annotation colors
annotation_colors <- list(pre_post_intervention = c("pre_Yellow" = "lemonchiffon1",
"post_Yellow" = "yellow2",
"pre_Red" = "rosybrown2",
"post_Red" = "darkred"))(ANOVA_heatmap <-
pheatmap(t(ANOVA_heatmap_data2[,-c(1:2)]),
scale = "row",
cluster_rows = TRUE,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
labels_col = ANOVA_heatmap_data2$Subject_pre_post_intervention,
color = colorRampPalette(c("#67a9cf", "#f7f7f7", "#ef8a62"))(16),
annotation_col = anno_trt_row,
annotation_colors = annotation_colors,
annotation_names_col = F,
main = "Heatmap of features significant across timepoints \nby repeated measures one-way ANOVA \nBenjamoni-Hochberg corrected p-values > 0.05 \nRPLC-MS (+)"))Here, I am comparing pre- to post-intervention for both yellow and tomato soy (Red) juice interventions. I also compare post-yellow to post-red intervention. I am using the log transformed values of rel abundance since parametric tests assume normality.
# run paired t-tests for control intervention
ctrl_t.test_paired <- df_for_stats %>%
filter(Intervention == "Yellow") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
ctrl_t.test_paired_sig <- ctrl_t.test_paired %>%
filter(p <= 0.05)
tibble(ctrl_t.test_paired_sig)## # A tibble: 148 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 100.0756_0.… rel_… post pre 12 12 -2.33 11 4.02e-2 *
## 2 111.044_2.9… rel_… post pre 12 12 -2.40 11 3.55e-2 *
## 3 114.0669_0.… rel_… post pre 12 12 -2.43 11 3.35e-2 *
## 4 132.0768_0.… rel_… post pre 12 12 -3.63 11 3.98e-3 **
## 5 136.0482_0.… rel_… post pre 12 12 3.28 11 7.3 e-3 **
## 6 141.0659_0.… rel_… post pre 12 12 -2.50 11 2.92e-2 *
## 7 142.0862_0.… rel_… post pre 12 12 -3.26 11 7.57e-3 **
## 8 149.0598_7.… rel_… post pre 12 12 2.56 11 2.65e-2 *
## 9 156.0769_0.… rel_… post pre 12 12 -4.69 11 6.63e-4 ***
## 10 159.1492_0.… rel_… post pre 12 12 -2.65 11 2.24e-2 *
## # ℹ 138 more rows
## [1] 148
# run paired t-tests for control intervention
ctrl_noOutlier_t.test_paired <- df_for_stats_noOutlier %>%
filter(Intervention == "Yellow") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
ctrl_noOutlier_t.test_paired_sig <- ctrl_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(ctrl_noOutlier_t.test_paired_sig)## # A tibble: 98 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 132.0768_0.… rel_… post pre 10 10 -3.15 9 0.0117 *
## 2 136.0482_0.… rel_… post pre 10 10 2.58 9 0.0296 *
## 3 142.0862_0.… rel_… post pre 10 10 -2.82 9 0.02 *
## 4 144.1283_0.… rel_… post pre 10 10 2.33 9 0.0448 *
## 5 156.0769_0.… rel_… post pre 10 10 -4.13 9 0.00255 **
## 6 162.1126_0.… rel_… post pre 10 10 -3.12 9 0.0124 *
## 7 163.1243_0.… rel_… post pre 10 10 -3.98 9 0.00322 **
## 8 166.0862_0.… rel_… post pre 10 10 -3.49 9 0.0068 **
## 9 170.0926_0.… rel_… post pre 10 10 -2.98 9 0.0155 *
## 10 174.1237_2.… rel_… post pre 10 10 4.06 9 0.00283 **
## # ℹ 88 more rows
## [1] 98
# run paired t-tests for control intervention
red_t.test_paired <- df_for_stats %>%
filter(Intervention == "Red") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
red_t.test_paired_sig <- red_t.test_paired %>%
filter(p <= 0.05)
tibble(red_t.test_paired_sig)## # A tibble: 79 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 111.044_2.9… rel_… post pre 12 12 -2.27 11 0.0439 *
## 2 121.0284_3.… rel_… post pre 12 12 3.24 11 0.00789 **
## 3 135.0441_3.… rel_… post pre 12 12 -3.16 11 0.00908 **
## 4 144.1023_0.… rel_… post pre 12 12 -3.21 11 0.00824 **
## 5 145.0648_5.… rel_… post pre 12 12 -3.31 11 0.0069 **
## 6 160.0967_0.… rel_… post pre 12 12 -3.80 11 0.00295 **
## 7 166.0863_2.… rel_… post pre 12 12 -2.44 11 0.0328 *
## 8 170.0447_0.… rel_… post pre 12 12 2.70 11 0.0207 *
## 9 170.0448_2.… rel_… post pre 12 12 2.74 11 0.0194 *
## 10 190.0499_3.… rel_… post pre 12 12 -2.73 11 0.0196 *
## # ℹ 69 more rows
## [1] 79
# run paired t-tests for control intervention
red_noOutlier_t.test_paired <- df_for_stats_noOutlier %>%
filter(Intervention == "Red") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
red_noOutlier_t.test_paired_sig <- red_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(red_noOutlier_t.test_paired_sig)## # A tibble: 69 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 121.0284_3.… rel_… post pre 10 10 3.51 9 0.00665 **
## 2 121.0646_4.… rel_… post pre 10 10 3.45 9 0.0073 **
## 3 135.0441_3.… rel_… post pre 10 10 -2.76 9 0.022 *
## 4 138.0551_0.… rel_… post pre 10 10 2.76 9 0.0223 *
## 5 144.1023_0.… rel_… post pre 10 10 -2.95 9 0.0162 *
## 6 145.0648_5.… rel_… post pre 10 10 -2.66 9 0.0259 *
## 7 160.0967_0.… rel_… post pre 10 10 -3.76 9 0.0045 **
## 8 166.0863_2.… rel_… post pre 10 10 -2.29 9 0.0481 *
## 9 190.0499_3.… rel_… post pre 10 10 -2.37 9 0.0417 *
## 10 196.0604_3.… rel_… post pre 10 10 3.25 9 0.00993 **
## # ℹ 59 more rows
## [1] 69
# run paired t-tests for post interventions
post_t.test_paired <- df_for_stats %>%
filter(pre_post == "post") %>%
dplyr::select(Subject, Intervention, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ Intervention,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
post_t.test_paired_sig <- post_t.test_paired %>%
filter(p <= 0.05)
tibble(post_t.test_paired_sig)## # A tibble: 28 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 203.139_0.6… rel_… Red Yellow 12 12 2.33 11 3.99e-2 *
## 2 234.1122_4.… rel_… Red Yellow 12 12 -6.51 11 4.38e-5 ****
## 3 250.1107_3.… rel_… Red Yellow 12 12 -4.16 11 1.59e-3 **
## 4 255.0655_5.… rel_… Red Yellow 12 12 10.2 11 5.95e-7 ****
## 5 271.0596_4.… rel_… Red Yellow 12 12 13.3 11 4.1 e-8 ****
## 6 271.1656_4.… rel_… Red Yellow 12 12 2.33 11 3.98e-2 *
## 7 274.1833_5.… rel_… Red Yellow 12 12 2.36 11 3.8 e-2 *
## 8 277.1432_7.… rel_… Red Yellow 12 12 3.90 11 2.47e-3 **
## 9 292.1575_3.… rel_… Red Yellow 12 12 -2.25 11 4.57e-2 *
## 10 302.1959_0.… rel_… Red Yellow 12 12 2.71 11 2.02e-2 *
## # ℹ 18 more rows
## [1] 28
# run paired t-tests for post interventions
post_noOutlier_t.test_paired <- df_for_stats %>%
filter(pre_post == "post",
Subject != "6106") %>%
dplyr::select(Subject, Intervention, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ Intervention,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant?
post_noOutlier_t.test_paired_sig <- post_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(post_noOutlier_t.test_paired_sig)## # A tibble: 32 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 142.0862_0.… rel_… Red Yellow 11 11 2.56 10 2.83e-2 *
## 2 159.1492_0.… rel_… Red Yellow 11 11 2.98 10 1.39e-2 *
## 3 160.0967_0.… rel_… Red Yellow 11 11 -2.24 10 4.86e-2 *
## 4 219.1705_0.… rel_… Red Yellow 11 11 -2.36 10 3.97e-2 *
## 5 230.9902_0.… rel_… Red Yellow 11 11 -2.56 10 2.86e-2 *
## 6 234.1122_4.… rel_… Red Yellow 11 11 -4.55 10 1.05e-3 **
## 7 250.1107_3.… rel_… Red Yellow 11 11 -2.82 10 1.8 e-2 *
## 8 255.0655_5.… rel_… Red Yellow 11 11 9.28 10 3.15e-6 ****
## 9 265.1278_3.… rel_… Red Yellow 11 11 2.28 10 4.56e-2 *
## 10 271.0596_4.… rel_… Red Yellow 11 11 11.6 10 4.08e-7 ****
## # ℹ 22 more rows
## [1] 32
# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
red_v_yel_volcano_data_noOutlier <- df_for_stats %>%
filter(pre_post == "post",
Subject != 6106,
Subject != 6112) %>% # remove outlier subj
group_by(Intervention, mz_rt) %>%
summarize(mean_rel_abund = mean(2^rel_abund_log2)) %>%
pivot_wider(names_from = Intervention, values_from = mean_rel_abund) %>%
mutate(FC_postRed_div_postYellow = Red/Yellow)
# bind back pval
red_v_yel_tobind_noOutlier <- post_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
red_v_yel_volcano_data_noOutlier <-
bind_cols(red_v_yel_volcano_data_noOutlier, red_v_yel_tobind_noOutlier) %>%
mutate(log2_FC_postRed_div_postYellow = if_else(FC_postRed_div_postYellow > 0,
log2(FC_postRed_div_postYellow),
-(log2(abs(FC_postRed_div_postYellow)))),
neglog10p = -log10(p))
# create a df of features passing FC and pval cutoffs higher in post-Red
postRed_sig_red_v_yel_volcano_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow >= 0.6)
# create a df of features passing FC and pval cutoffs higher in post-control
postYellow_sig_red_v_yel_volcano_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow <= -0.6) Create feature lists with significant features along with their clusters
#post-Red list
postRed_sig_intrvntn_comp_clusters <- left_join(postRed_sig_red_v_yel_volcano_noOutlier, cluster_features,
by = "mz_rt")
write_csv(postRed_sig_intrvntn_comp_clusters,
"Feature lists/postRed-sigfeatures-intervntn-comp.csv")
#post-Yellow list
postYellow_sig_intrvntn_comp_clusters <- left_join(postYellow_sig_red_v_yel_volcano_noOutlier, cluster_features,
by = "mz_rt")
write_csv(postYellow_sig_intrvntn_comp_clusters,
"Feature lists/postYellow-sigfeatures-intervntn-comp.csv")(red_v_yellow_volcano_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_postRed_div_postYellow, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change tomato/control: {round(FC_postRed_div_postYellow, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = postRed_sig_intrvntn_comp_clusters,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "tomato") +
geom_point(data = postYellow_sig_intrvntn_comp_clusters,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 0.6, linetype = "dashed", color = "grey") +
geom_vline(xintercept = -0.6, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy and Control Juice Consumption",
subtitle = "Red points are higher after tomato-soy juice consumption while yellow points are higher \nafter control tomato juice consumption. Subject 6106 removed",
caption = "Vertical dashed lines represent a log2 fold change > 0.6 or < -0.6, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (TomatoSoy/Control)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(red_v_yellow_volcano_ggplotly_noOutlier <- ggplotly(red_v_yellow_volcano_noOutlier, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = red_v_yellow_volcano_noOutlier,
filename = "plots and figures/volcano plots/red_v_yellow_volcano_noOutlier.svg")
# save interactive volcano plot
saveWidget(widget = red_v_yellow_volcano_ggplotly_noOutlier,
file = "plots and figures/volcano plots/interactive_redvyellow_volcano_plot_noOutlier.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
red_volcano_data_noOutlier <- df_for_stats %>%
filter(Intervention == "Red",
Subject != 6106,
Subject != 6112) %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(2^rel_abund_log2)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
red_tobind_noOutlier <- red_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
red_volcano_data_noOutlier <-
bind_cols(red_volcano_data_noOutlier, red_tobind_noOutlier) %>%
mutate(log2_FC_post_div_pre = log2(FC_post_div_pre),
neglog10p = -log10(p))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
red_pre_v_post_volcano_noOutlier <- red_volcano_data_noOutlier %>%
filter(p <= 0.05 & abs(log2_FC_post_div_pre) >= 0.6)Create feature lists with significant features along with their clusters
(red_volcano_noOutlier <- red_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = red_sig_prepost_comp_clusters,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "tomato") +
geom_vline(xintercept = 0.6, linetype = "dashed", color = "grey") +
geom_vline(xintercept = -0.6, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 6)) +
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy Juice Consumption",
subtitle = "Red points are higher after tomato-soy juice consumption when compared to prior to consumption",
caption = "Vertical dashed lines represent an abs(log fold change) > 0.6, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))Save plots
# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
yel_volcano_data_noOutlier <- df_for_stats %>%
filter(Intervention == "Yellow",
Subject != 6106,
Subject != 6112) %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(2^rel_abund_log2)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
yel_tobind_noOutlier <- ctrl_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
yel_volcano_data_noOutlier <-
bind_cols(yel_volcano_data_noOutlier, yel_tobind_noOutlier) %>%
mutate(log2_FC_post_div_pre = log2(FC_post_div_pre),
neglog10p = -log10(p))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
yel_pre_v_post_volcano_noOutlier <- yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & abs(log2_FC_post_div_pre) >= 0.6)Create feature lists with significant features along with their clusters
(yel_volcano_noOutlier <- yel_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = yel_sig_prepost_comp_clusters,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 0.6, linetype = "dashed", color = "grey") +
geom_vline(xintercept = -0.6, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-6, 6)) +
labs(title = "Volcano Plot of Features Different in People After Control, Yellow Tomato Juice Consumption",
subtitle = "Yellow points are higher after control juice consumption when compared to prior to consumption.\nSubject 6106 removed",
caption = "Vertical dashed lines represent abs(log2 fold change) > 0.6, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))Save plots
Here, I want to venn significant features before I begin to look more into them. I am interested in the following effects: tomato effect, lycopene and/or soy isoflavones effect.
# keep only features present in both pre vs post red and pre vs post yellow
tomato_effect <- inner_join(red_sig_prepost_comp_clusters,
yel_sig_prepost_comp_clusters,
by = "mz_rt")
dim(tomato_effect)## [1] 3 25
Export venned list
# add in Feature ID annotations
anno_tomato_effect <- left_join(tomato_effect, df_for_stats_noOutlier, by = "mz_rt")
# metabs with pval < 0.05 and fc >= 1.51
sigmetabs_tomato_effect <- anno_tomato_effect$Feature_IDtomato_effect_df_for_stats_wide <- df_for_stats_noOutlier_wide %>%
dplyr::select(c(1:11),
all_of(sigmetabs_tomato_effect))
# make tidy df
tomato_effect_df_for_stats_tidy <- tomato_effect_df_for_stats_wide %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund_log2")# changing factor levels for pre_post_intervention
tomato_effect_df_for_stats_tidy$pre_post_intervention <- factor(tomato_effect_df_for_stats_tidy$pre_post_intervention,
levels = c("pre_Yellow", "post_Yellow", "pre_Red", "post_Red"))
levels(tomato_effect_df_for_stats_tidy$pre_post_intervention) ## [1] "pre_Yellow" "post_Yellow" "pre_Red" "post_Red"
tomato_effect_df_for_stats_tidy %>%
ggplot(aes(x = pre_post_intervention, y = rel_abund_log2, fill = Intervention)) +
geom_boxplot(outlier.shape = NA) +
scale_fill_manual(values = c("Yellow" = "gold",
"Red" = "tomato1")) +
geom_line(aes(group = Subject), fill = "brown", linewidth = 0.2) +
facet_wrap(vars(mz_rt), scales = "free_y") +
theme_clean() Let’s make them prettier.
my_comparisons <- list( c("pre_Yellow", "post_Yellow"), c("pre_Red", "post_Red") )
(bps_tomatoeffect_metabs <- tomato_effect_df_for_stats_tidy %>%
ggpaired(x = "pre_post_intervention", y = "rel_abund_log2",
fill = "Intervention", line.color = "gray",
id = "Subject", line.size = 0.15) +
scale_fill_manual(values = c("Red" = "tomato1",
"Yellow" = "yellow1"),
labels = c("Tomato-Soy", "Control")) +
geom_point() +
scale_x_discrete(labels = c("pre", "post", "pre", "post")) +
labs(x = "",
y = "Log2 relative abundance",
title = "") +
facet_wrap( ~ mz_rt, ncol = 3, labeller = labeller(mz_rt = label_wrap_gen(13))) +
stat_compare_means(comparisons = my_comparisons, method = "t.test", paired = TRUE, p.adjust.method = "BH", label = "p.signif") +
theme_classic(base_size = 14, base_family = "sans") +
theme(axis.text.x = element_text(angle = 0)))# combine sig features from post-red vs post-yellow comparison
sig_postintervention_noOutlier <- full_join(postRed_sig_intrvntn_comp_clusters,
postYellow_sig_intrvntn_comp_clusters)
dim(sig_postintervention_noOutlier)## [1] 27 13
# select only sig features that are present when comparing pre-Red to post-Red
lyc_soy_effect <- inner_join(sig_postintervention_noOutlier,
red_sig_prepost_comp_clusters,
by = "mz_rt")
dim(lyc_soy_effect)## [1] 22 25
Export venned list
# add in Feature ID annotations
anno_lycsoy_effect <- left_join(lyc_soy_effect, df_for_stats_noOutlier, by = "mz_rt")
# metabs with pval < 0.05 and fc >= 1.51
sigmetabs_lycsoy_effect <- anno_lycsoy_effect$Feature_IDlycsoy_effect_df_for_stats_wide <- df_for_stats_noOutlier_wide %>%
dplyr::select(c(1:11),
all_of(sigmetabs_lycsoy_effect))
# make tidy df
lycsoy_effect_df_for_stats_tidy <- lycsoy_effect_df_for_stats_wide %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund_log2")# changing factor levels for pre_post_intervention
lycsoy_effect_df_for_stats_tidy$pre_post_intervention <- factor(lycsoy_effect_df_for_stats_tidy$pre_post_intervention,
levels = c("pre_Yellow", "post_Yellow", "pre_Red", "post_Red"))
levels(lycsoy_effect_df_for_stats_tidy$pre_post_intervention) ## [1] "pre_Yellow" "post_Yellow" "pre_Red" "post_Red"
lycsoy_effect_df_for_stats_tidy %>%
ggplot(aes(x = pre_post_intervention, y = rel_abund_log2, fill = Intervention)) +
geom_boxplot(outlier.shape = NA) +
scale_fill_manual(values = c("Yellow" = "gold",
"Red" = "tomato1")) +
geom_line(aes(group = Subject), fill = "brown", linewidth = 0.2) +
facet_wrap(vars(mz_rt), scales = "free_y") +
theme_clean() ## [1] 27 13
# select only sig features that are present when comparing pre-Yellow to post-Yellow
low_carot_tomato_effect <- inner_join(sig_postintervention_noOutlier,
yel_sig_prepost_comp_clusters,
by = "mz_rt")
dim(low_carot_tomato_effect)## [1] 6 25
Export venned list
# add in Feature ID annotations
anno_lowcarot_effect <- left_join(low_carot_tomato_effect, df_for_stats_noOutlier, by = "mz_rt")
# metabs with pval < 0.05 and fc >= 1.51
sigmetabs_lowcarot_effect <- anno_lowcarot_effect$Feature_IDyellow_effect_df_for_stats_wide <- df_for_stats_noOutlier_wide %>%
dplyr::select(c(1:11),
all_of(sigmetabs_lowcarot_effect))
# make tidy df
yellow_effect_df_for_stats_tidy <- yellow_effect_df_for_stats_wide %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund_log2")# changing factor levels for pre_post_intervention
yellow_effect_df_for_stats_tidy$pre_post_intervention <- factor(yellow_effect_df_for_stats_tidy$pre_post_intervention,
levels = c("pre_Yellow", "post_Yellow", "pre_Red", "post_Red"))
levels(yellow_effect_df_for_stats_tidy$pre_post_intervention) ## [1] "pre_Yellow" "post_Yellow" "pre_Red" "post_Red"
yellow_effect_df_for_stats_tidy %>%
ggplot(aes(x = pre_post_intervention, y = rel_abund_log2, fill = Intervention)) +
geom_boxplot(outlier.shape = NA) +
scale_fill_manual(values = c("Yellow" = "gold",
"Red" = "tomato1")) +
geom_line(aes(group = Subject), fill = "brown", linewidth = 0.2) +
facet_wrap(vars(mz_rt), scales = "free_y") +
theme_clean()